diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..52ea513 --- /dev/null +++ b/.gitignore @@ -0,0 +1,135 @@ +.vscode +data +outputs +**/.ipynb_checkpoints +*.ipynb + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..26c163f --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Cheng Chi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e2f6022
--- /dev/null
+++ b/README.md
@@ -0,0 +1,116 @@
+# Iterative Residual Policy
+This repository contains the source code for the paper [Iterative Residual Policy for Goal-Conditioned Dynamic Manipulation of Deformable Objects](https://irp.cs.columbia.edu/). This paper has been accepted to RSS 2022.
+
+(overview figure)
+
+## Cite this work
+```
+@inproceedings{chi2022irp,
+  title={Iterative Residual Policy for Goal-Conditioned Dynamic Manipulation of Deformable Objects},
+  author={Chi, Cheng and Burchfiel, Benjamin and Cousineau, Eric and Feng, Siyuan and Song, Shuran},
+  booktitle={Proceedings of Robotics: Science and Systems (RSS)},
+  year={2022}
+}
+```
+
+## Datasets
+IRP Rope Dataset [required for eval] (7.63GB)
+* [Google Drive (Recommended)](https://drive.google.com/file/d/1uv7APODe6yl3vTaBLfOOz5Q9APdZdkqd/view?usp=sharing)
+* [Columbia Server](https://irp.cs.columbia.edu/data/irp_rope.zarr.tar)
+
+## Pretrained Models
+IRP Rope [action + tracking] (914MB)
+* [Google Drive (Recommended)](https://drive.google.com/file/d/1_lODhez-JeGKbQvxHHfD3NBHBT-TSvrD/view?usp=sharing)
+* [Columbia Server](https://irp.cs.columbia.edu/data/checkpoints.tar)
+
+## Usage
+### Installation (🖥️ Basic)
+A conda [environment.yml](./environment.yml) for `python=3.8, pytorch=1.9.0 and cudatoolkit=11.2` is provided.
+```
+conda env create --file environment.yml
+```
+Please try [mambaforge](https://github.com/conda-forge/miniforge) for better dependency conflict resolution.
+```
+mamba env create --file environment.yml
+```
+
+### Installation (👾 Simulation)
+* Install [Mujoco 2.1.0](https://github.com/deepmind/mujoco/releases/tag/2.1.0)
+* Install [mujoco-py 2.0.2](https://github.com/openai/mujoco-py) and carefully follow its instructions.
+* Install [abr_control](https://github.com/cheng-chi/abr_control) with Mujoco.
+
+### Installation (🦾 Real Robot)
+#### Hardware
+* [Stereolabs ZED 2i Camera](https://www.stereolabs.com/zed-2i/)
+* [UR5-CB3](https://www.universal-robots.com/cb3) or [UR5e](https://www.universal-robots.com/products/ur5-robot/) ([RTDE Interface](https://www.universal-robots.com/articles/ur/interface-communication/real-time-data-exchange-rtde-guide/) is required)
+* [Millibar Robotics Manual Tool Changer](https://www.millibar.com/manual-tool-changer/) (only the robot side is needed)
+* 3D print the [Quick Change Plate](https://cad.onshape.com/documents/2298872dc9e43725186484ff/w/a74a4b142d00ea670de5fc6b/e/dcf3c1f5418eb0ffac59e848?renderMode=0&uiState=625c8355314d4c5f8c688959) to mount the wooden extension stick to the EEF.
+* 3/8 inch Square Wooden Dowel
+* [8mm Cotton Rope](https://www.amazon.com/gp/product/B08TWMNV4P)
+* Wood screws
+* Duct tape 😛
+
+#### Software
+* Install [Zed SDK](https://www.stereolabs.com/developers/release/) and [pyzed](https://www.stereolabs.com/docs/app-development/python/install/)
+* Install [ur_rtde](https://sdurobotics.gitlab.io/ur_rtde/)
+
+### Evaluation (🖥️ Basic)
+Under the project root (i.e. `irp/`), create a `data` folder and download the [IRP Rope Dataset](#datasets) as well as the [Pretrained Models](#pretrained-models). Extract the tar files:
+
+```
+$ cd data
+$ tar -xvf checkpoints.tar
+$ tar -xvf irp_rope.zarr.tar
+```
+Activate the environment:
+```
+$ conda activate irp
+(irp) $
+```
+Run the dataset evaluation script. Use `action.gpu_id` to select a GPU on multi-GPU systems.
+```
+(irp) $ python eval_irp_rope_dataset.py action.gpu_id=0
+```
+
+A `log.pkl` file will be saved to the ad-hoc output directory created by [hydra](https://hydra.cc/docs/intro/). Add the command-line argument `offline=False` to enable wandb logging (recommended).
+
+The numbers reported in our paper were generated using this method.
+
+### Evaluation (👾 Simulation)
+Extract data and checkpoints following [basic](#evaluation-🖥️-basic) (required).
+Install dependencies following [sim installation](#installation-👾-simulation).
+
+Run the simulation evaluation script.
+```
+(irp) $ python eval_irp_rope_sim.py action.gpu_id=0
+```
+
+Note that this method has not been extensively tested and is mainly provided for future development. Mujoco might crash due to numerical instability (i.e. NaN), which is handled better in the dataset evaluation.
+
+### Evaluation (🦾 Real Robot)
+Extract data and checkpoints following [basic](#evaluation-🖥️-basic) (required).
+Install dependencies following [real installation](#installation-🦾-real-robot).
+
+Initialize the UR5 robot and write down its IP address (`<ur5_ip>`). Move the robot in teach mode close to the joint configuration `[-90,-70,150,-170,-90,-90]` to prevent unexpected movement.
+
+Run `ur5_camera_calibration_app.py` to create the homography calibration file `data/calibration.pkl`. Use the `A/D` keys to move `j2` and the `W/S` keys to move `j3`. Click on the tip of the wooden extension for calibration.
+```
+(irp) $ python ur5_camera_calibration_app.py --ip <ur5_ip> -o data/calibration.pkl
+```
+Example calibration result: the red and blue crosses should be fairly close.
+(calibration example figure)
+
+Run the real-robot evaluation script.
+```
+(irp) $ python eval_irp_rope_real.py
+```
+Results and videos will be saved to the ad-hoc output directory created by hydra.
+
+### Training
+To train the IRP model from scratch:
+```
+(irp) $ python train_irp.py
+```
+
+In case of inaccurate tracking, use `video_labeler.py` to generate tracking labels and `train_tracker.py` to train the tracking model.
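For quick reference, here is the basic dataset evaluation workflow from above condensed into a single shell session. This is only a sketch: it uses the Columbia server mirrors listed under [Datasets](#datasets) and [Pretrained Models](#pretrained-models), and the extracted folder names are assumptions based on the tar file names.
```
# run from the project root (irp/)
mkdir -p data && cd data
wget https://irp.cs.columbia.edu/data/checkpoints.tar
wget https://irp.cs.columbia.edu/data/irp_rope.zarr.tar
tar -xvf checkpoints.tar      # assumed to extract to data/checkpoints/
tar -xvf irp_rope.zarr.tar    # assumed to extract to data/irp_rope.zarr/
cd ..

conda activate irp
python eval_irp_rope_dataset.py action.gpu_id=0
```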
diff --git a/abr_control_mod/mujoco_utils.py b/abr_control_mod/mujoco_utils.py new file mode 100644 index 0000000..61b9db3 --- /dev/null +++ b/abr_control_mod/mujoco_utils.py @@ -0,0 +1,121 @@ +from typing import Sequence, Tuple + +import numpy as np +import mujoco_py as mjp + +def get_rope_body_ids( + model: mjp.cymj.PyMjModel, + prefix='B', + check_topology=True) -> np.ndarray: + + rope_body_names = list() + name_idx_map = dict() + for body_name in model.body_names: + if body_name.startswith(prefix): + rope_body_names.append(body_name) + name_idx_map[body_name] = int(body_name.strip(prefix)) + + rope_body_names = sorted(rope_body_names, key=lambda x: name_idx_map[x]) + rope_body_ids = [model.body_name2id(x) for x in rope_body_names] + + if check_topology: + for i in range(1, len(rope_body_ids)): + assert(model.body_parentid[rope_body_ids[i]] \ + == rope_body_ids[i-1]) + return np.array(rope_body_ids) + +def get_cloth_body_ids( + model: mjp.cymj.PyMjModel, + prefix='B') -> np.ndarray: + name_idx_map = dict() + imax = 0 + jmax = 0 + for body_name in model.body_names: + if body_name.startswith(prefix): + i, j = [int(x) for x in body_name.strip(prefix).split('_')] + name_idx_map[body_name] = (i,j) + imax = max(imax, i) + jmax = max(jmax, j) + + id_arr = np.zeros((imax+1, jmax+1), dtype=np.int64) + for key, value in name_idx_map.items(): + body_id = model.body_name2id(key) + id_arr[value] = body_id + return id_arr + +def get_body_center_of_mass( + data: mjp.cymj.PyMjData, + body_ids: np.ndarray + ) -> np.ndarray: + return data.xipos[body_ids] + +def apply_force_com( + model: mjp.cymj.PyMjModel, + data: mjp.cymj.PyMjData, + body_id: int, + force: np.ndarray): + com_point = data.xipos[body_id] + torque = np.zeros(3) + mjp.functions.mj_applyFT( + model, data, + force, torque, com_point, + body_id, data.qfrc_applied) + +def apply_force_com_batch( + model: mjp.cymj.PyMjModel, + data: mjp.cymj.PyMjData, + body_ids: Sequence[int], + forces: np.ndarray): + assert(len(body_ids) == len(forces)) + for i in range(len(body_ids)): + apply_force_com(model, data, body_ids[i], forces[i]) + +def clear_forces(data): + data.qfrc_applied[:] = 0 + +def apply_impulse_com_batch( + sim: mjp.cymj.MjSim, + body_ids: Sequence[int], + impulses: np.ndarray): + + dt = sim.model.opt.timestep + forces = impulses / dt + apply_force_com_batch( + model=sim.model, data=sim.data, + body_ids=body_ids, + forces=forces) + sim.step() + clear_forces(sim.data) + +def get_rope_dof_idxs( + model: mjp.cymj.PyMjModel, + prefix='B', + check_topology=True + ) -> Tuple[np.ndarray, np.ndarray]: + rope_body_ids = get_rope_body_ids( + model, prefix, check_topology) + + dof_body_ids = list() + dof_idxs = list() + for body_id in rope_body_ids: + num_dof = model.body_dofnum[body_id] + if num_dof == 0: + continue + assert(num_dof == 2) + dof_adr = model.body_dofadr[body_id] + idxs = [dof_adr + i for i in range(num_dof)] + dof_idxs.append(idxs) + dof_body_ids.append(body_id) + return np.array(dof_idxs), np.array(dof_body_ids) + +def get_mujoco_state(sim): + q = np.copy(sim.data.qpos) + dq = np.copy(sim.data.qvel) + u = np.copy(sim.data.ctrl) + return (q, dq, u) + +def set_mujoco_state(sim, state): + q, dq, u = state + sim.data.qpos[:] = q + sim.data.qvel[:] = dq + sim.data.ctrl[:] = u diff --git a/assets/mujoco/ur5/meshes/floor_tile.png b/assets/mujoco/ur5/meshes/floor_tile.png new file mode 100644 index 0000000..768943a Binary files /dev/null and b/assets/mujoco/ur5/meshes/floor_tile.png differ diff --git a/assets/mujoco/ur5/meshes/link0.stl 
b/assets/mujoco/ur5/meshes/link0.stl new file mode 100644 index 0000000..72d25c1 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link0.stl differ diff --git a/assets/mujoco/ur5/meshes/link1.stl b/assets/mujoco/ur5/meshes/link1.stl new file mode 100644 index 0000000..044810d Binary files /dev/null and b/assets/mujoco/ur5/meshes/link1.stl differ diff --git a/assets/mujoco/ur5/meshes/link1_cap.stl b/assets/mujoco/ur5/meshes/link1_cap.stl new file mode 100644 index 0000000..26d044a Binary files /dev/null and b/assets/mujoco/ur5/meshes/link1_cap.stl differ diff --git a/assets/mujoco/ur5/meshes/link1_connector.stl b/assets/mujoco/ur5/meshes/link1_connector.stl new file mode 100644 index 0000000..7080999 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link1_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/link2.stl b/assets/mujoco/ur5/meshes/link2.stl new file mode 100644 index 0000000..a3e7a90 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link2.stl differ diff --git a/assets/mujoco/ur5/meshes/link2_cap.stl b/assets/mujoco/ur5/meshes/link2_cap.stl new file mode 100644 index 0000000..0df2c59 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link2_cap.stl differ diff --git a/assets/mujoco/ur5/meshes/link2_connector.stl b/assets/mujoco/ur5/meshes/link2_connector.stl new file mode 100644 index 0000000..565f2e9 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link2_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/link2_tube.stl b/assets/mujoco/ur5/meshes/link2_tube.stl new file mode 100644 index 0000000..c26fa7d Binary files /dev/null and b/assets/mujoco/ur5/meshes/link2_tube.stl differ diff --git a/assets/mujoco/ur5/meshes/link3_cap.stl b/assets/mujoco/ur5/meshes/link3_cap.stl new file mode 100644 index 0000000..bd9144a Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3_cap.stl differ diff --git a/assets/mujoco/ur5/meshes/link3_tube.stl b/assets/mujoco/ur5/meshes/link3_tube.stl new file mode 100644 index 0000000..614d550 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3_tube.stl differ diff --git a/assets/mujoco/ur5/meshes/link3a.stl b/assets/mujoco/ur5/meshes/link3a.stl new file mode 100644 index 0000000..d37120f Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3a.stl differ diff --git a/assets/mujoco/ur5/meshes/link3a_connector.stl b/assets/mujoco/ur5/meshes/link3a_connector.stl new file mode 100644 index 0000000..2419fcd Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3a_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/link3b.stl b/assets/mujoco/ur5/meshes/link3b.stl new file mode 100644 index 0000000..9eb5c94 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3b.stl differ diff --git a/assets/mujoco/ur5/meshes/link3b_connector.stl b/assets/mujoco/ur5/meshes/link3b_connector.stl new file mode 100644 index 0000000..e3adb75 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3b_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/link3b_connector2.stl b/assets/mujoco/ur5/meshes/link3b_connector2.stl new file mode 100644 index 0000000..79e3324 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link3b_connector2.stl differ diff --git a/assets/mujoco/ur5/meshes/link4.stl b/assets/mujoco/ur5/meshes/link4.stl new file mode 100644 index 0000000..250d0ed Binary files /dev/null and b/assets/mujoco/ur5/meshes/link4.stl differ diff --git a/assets/mujoco/ur5/meshes/link4_cap.stl b/assets/mujoco/ur5/meshes/link4_cap.stl new file mode 100644 index 0000000..d7bb39f Binary files /dev/null and 
b/assets/mujoco/ur5/meshes/link4_cap.stl differ diff --git a/assets/mujoco/ur5/meshes/link4_connector.stl b/assets/mujoco/ur5/meshes/link4_connector.stl new file mode 100644 index 0000000..73c7e9c Binary files /dev/null and b/assets/mujoco/ur5/meshes/link4_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/link5.stl b/assets/mujoco/ur5/meshes/link5.stl new file mode 100644 index 0000000..3b40138 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link5.stl differ diff --git a/assets/mujoco/ur5/meshes/link5_cap.stl b/assets/mujoco/ur5/meshes/link5_cap.stl new file mode 100644 index 0000000..57475e4 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link5_cap.stl differ diff --git a/assets/mujoco/ur5/meshes/link5_connector.stl b/assets/mujoco/ur5/meshes/link5_connector.stl new file mode 100644 index 0000000..9de33ab Binary files /dev/null and b/assets/mujoco/ur5/meshes/link5_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/link6.stl b/assets/mujoco/ur5/meshes/link6.stl new file mode 100644 index 0000000..670bcaf Binary files /dev/null and b/assets/mujoco/ur5/meshes/link6.stl differ diff --git a/assets/mujoco/ur5/meshes/link6_connector.stl b/assets/mujoco/ur5/meshes/link6_connector.stl new file mode 100644 index 0000000..a593ec0 Binary files /dev/null and b/assets/mujoco/ur5/meshes/link6_connector.stl differ diff --git a/assets/mujoco/ur5/meshes/ur5.skp b/assets/mujoco/ur5/meshes/ur5.skp new file mode 100644 index 0000000..f02e8bc Binary files /dev/null and b/assets/mujoco/ur5/meshes/ur5.skp differ diff --git a/assets/mujoco/ur5/ur5_rope.xml b/assets/mujoco/ur5/ur5_rope.xml new file mode 100644 index 0000000..2e15e0a --- /dev/null +++ b/assets/mujoco/ur5/ur5_rope.xml @@ -0,0 +1,171 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/mujoco/ur5/ur5_stick_rope_v2.xml.jinja2 b/assets/mujoco/ur5/ur5_stick_rope_v2.xml.jinja2 new file mode 100644 index 0000000..8d52e9a --- /dev/null +++ b/assets/mujoco/ur5/ur5_stick_rope_v2.xml.jinja2 @@ -0,0 +1,175 @@ + + + + + + + + + diff --git a/common/async_dataloader.py b/common/async_dataloader.py new file mode 100644 index 0000000..61b7e79 --- /dev/null +++ b/common/async_dataloader.py @@ -0,0 +1,118 @@ +import collections.abc as container_abcs +import re +from queue import Queue +from threading import Thread +from typing import Any, Optional, Union + +import torch +from torch import Tensor +from torch._six import string_classes +from torch.utils.data import DataLoader, Dataset + + +class AsynchronousLoader: + """Class for asynchronously loading from CPU memory to device memory with DataLoader. + + Note that this only works for single GPU training, multiGPU uses PyTorch's DataParallel or + DistributedDataParallel which uses its own code for transferring data across GPUs. This could just + break or make things slower with DataParallel or DistributedDataParallel. + + Args: + data: The PyTorch Dataset or DataLoader we're using to load. + device: The PyTorch device we are loading to + q_size: Size of the queue used to store the data loaded to the device + num_batches: Number of batches to load. This must be set if the dataloader + doesn't have a finite __len__. It will also override DataLoader.__len__ + if set and DataLoader has a __len__. 
Otherwise it can be left as None + **kwargs: Any additional arguments to pass to the dataloader if we're + constructing one here + """ + + def __init__( + self, + data: Union[DataLoader, Dataset], + device: torch.device, + q_size: int = 10, + num_batches: Optional[int] = None, + **kwargs: Any, + ) -> None: + if isinstance(data, torch.utils.data.DataLoader): + self.dataloader = data + else: + self.dataloader = DataLoader(data, **kwargs) + + if num_batches is not None: + self.num_batches = num_batches + elif hasattr(self.dataloader, "__len__"): + self.num_batches = len(self.dataloader) + else: + raise Exception("num_batches must be specified or data must have finite __len__") + + self.device = device + self.q_size = q_size + + self.load_stream = torch.cuda.Stream(device=device) + self.queue: Queue = Queue(maxsize=self.q_size) + + self.idx = 0 + + self.np_str_obj_array_pattern = re.compile(r"[SaUO]") + + def load_loop(self) -> None: # The loop that will load into the queue in the background + for i, sample in enumerate(self.dataloader): + self.queue.put(self.load_instance(sample)) + if i == len(self): + break + + # Recursive loading for each instance based on torch.utils.data.default_collate + def load_instance(self, sample: Any) -> Any: + elem_type = type(sample) + + if torch.is_tensor(sample): + with torch.cuda.stream(self.load_stream): + # Can only do asynchronous transfer if we use pin_memory + if not sample.is_pinned(): + sample = sample.pin_memory() + return sample.to(self.device, non_blocking=True) + elif elem_type.__module__ == "numpy" and elem_type.__name__ != "str_" and elem_type.__name__ != "string_": + if elem_type.__name__ == "ndarray" and self.np_str_obj_array_pattern.search(sample.dtype.str) is not None: + return self.load_instance(sample) + return self.load_instance(torch.as_tensor(sample)) + elif isinstance(sample, container_abcs.Mapping): + return {key: self.load_instance(sample[key]) for key in sample} + elif isinstance(sample, tuple) and hasattr(sample, "_fields"): # namedtuple + return elem_type(*(self.load_instance(d) for d in sample)) + elif isinstance(sample, container_abcs.Sequence) and not isinstance(sample, string_classes): + return [self.load_instance(s) for s in sample] + else: + return sample + + def __iter__(self) -> "AsynchronousLoader": + # We don't want to run the thread more than once + # Start a new thread if we are at the beginning of a new epoch, and our current worker is dead + + if_worker = not hasattr(self, "worker") or not self.worker.is_alive() # type: ignore[has-type] + if if_worker and self.queue.empty() and self.idx == 0: + self.worker = Thread(target=self.load_loop) + self.worker.daemon = True + self.worker.start() + return self + + def __next__(self) -> Tensor: + # If we've reached the number of batches to return + # or the queue is empty and the worker is dead then exit + done = not self.worker.is_alive() and self.queue.empty() + done = done or self.idx >= len(self) + if done: + self.idx = 0 + self.queue.join() + self.worker.join() + raise StopIteration + # Otherwise return the next batch + out = self.queue.get() + self.queue.task_done() + self.idx += 1 + return out + + def __len__(self) -> int: + return self.num_batches diff --git a/common/cv_util.py b/common/cv_util.py new file mode 100644 index 0000000..a133e1a --- /dev/null +++ b/common/cv_util.py @@ -0,0 +1,32 @@ +import numpy as np +import cv2 + +def get_traj_occupancy(coords, transformer): + grid = np.zeros( + transformer.shape + (3,), + dtype=np.uint8) + traj_pix = 
transformer.to_grid(coords).astype(np.int32) + result = cv2.polylines( + grid, [traj_pix[:,::-1]], False, color=(1,1,1)) + occu = result[:,:,0] > 0 + return occu + +def get_dist_function(img): + """ + Approximation algorithm is used. + """ + non_zero_coords = np.stack(np.nonzero(img)).T + dist_img, nn_pix_idx = cv2.distanceTransformWithLabels( + src=(~img).astype(np.uint8), + distanceType=cv2.DIST_L2, + maskSize=cv2.DIST_MASK_PRECISE, + labelType=cv2.DIST_LABEL_PIXEL) + nn_pix_coord = non_zero_coords[nn_pix_idx-1] + return dist_img, nn_pix_coord + +def get_dist_function_precise(img): + dist_img = cv2.distanceTransform( + src=(~img).astype(np.uint8), + distanceType=cv2.DIST_L2, + maskSize=cv2.DIST_MASK_PRECISE) + return dist_img diff --git a/common/geometry_util.py b/common/geometry_util.py new file mode 100644 index 0000000..37d9dd7 --- /dev/null +++ b/common/geometry_util.py @@ -0,0 +1,8 @@ +import numpy as np + +def homo_transform(mat, points): + if mat.shape[1] != points.shape[1]: + points = np.concatenate([points, np.ones((points.shape[0],1))], axis=1) + homo = points @ mat.T + result = (homo[:,:homo.shape[1]-1].T / homo[:,-1]).T + return result diff --git a/common/mujoco_camera_util.py b/common/mujoco_camera_util.py new file mode 100644 index 0000000..0e16efc --- /dev/null +++ b/common/mujoco_camera_util.py @@ -0,0 +1,88 @@ +""" +Make sure to run render once before executing following functions, so that +offscreen rendering context is initialized +""" + +import math +from mujoco_py.builder import cymj +import numpy as np +from scipy.spatial.transform import Rotation + +def get_render_context(sim: cymj.MjSim) -> cymj.MjRenderContext: + return sim._render_context_offscreen + +def set_camera_pose(sim: cymj.MjSim, + lookat: np.ndarray, distance: float, + azimuth: float, elevation: float): + ctx = get_render_context(sim) + cam = ctx.cam + cam.lookat[:] = lookat + cam.distance = distance + cam.azimuth = azimuth + cam.elevation = elevation + +def normalize(v): + return v / np.linalg.norm(v) + +def lookAt(eye, center, up): + ''' Matrix 4x4 lookAt function.''' + f = normalize(center - eye) + u = normalize(up) + s = normalize(np.cross(f, u)) + u = np.cross(s, f) + + output = [[s[0], u[0], -f[0], 0.0], + [s[1], u[1], -f[1], 0.0], + [s[2], u[2], -f[2], 0.0], + [-s.dot(eye), -u.dot(eye), f.dot(eye), 1.0]] + return np.array(output).T + +def azel_to_enu(azimuth, elevation): + """ + Mujoco's Azimuth/Elevation is non-standard + Def: ray from the camera + Azimuth, angle from pos x, counter clockwise + Elevation, angle from xy plane + """ + # az_rad = azimuth/180*np.pi + xy_rad = azimuth/180*np.pi + ele_rad = elevation/180*np.pi + z = np.sin(ele_rad) + xy_norm = np.cos(ele_rad) + x = np.cos(xy_rad) * xy_norm + y = np.sin(xy_rad) * xy_norm + return np.array([x,y,z]) + +def get_extrinsic_mat(sim: cymj.MjSim): + """ + Mujoco uses OpenGL convesion for camera coords. + -Z front, X right, Y up + """ + ctx = get_render_context(sim) + cam = ctx.cam + center = np.array(cam.lookat) + up = np.array([0,0,1]) + eye = center - (azel_to_enu(cam.azimuth, cam.elevation) * cam.distance) + print(eye) + ext_mat = lookAt(eye, center, up) + return ext_mat + +def get_intrinsic_mat(sim: cymj.MjSim, width: int, height: int): + fovy = sim.model.vis.global_.fovy + f = 0.5 * height / math.tan(fovy * math.pi / 360) + int_mat = np.array(((f, 0, width / 2), (0, f, height / 2), (0, 0, 1))) + return int_mat + +def world_to_pixel(points, ext_mat, int_mat): + """ + Output OpenCV keypoint coorindate. 
+ Lower left corner origin + X right, Y up + """ + gl2cv = np.eye(4) + gl2cv[:3,:3] = Rotation.from_rotvec(np.array([np.pi,0,0])).as_matrix() + this_ext_mat = gl2cv @ ext_mat + cam_points = points @ this_ext_mat[:3,:3].T + this_ext_mat[:3,-1] + pix_points = cam_points @ int_mat.T + pixs = (pix_points[:,:2].T / pix_points[:,-1]).T + return pixs diff --git a/common/mujoco_util.py b/common/mujoco_util.py new file mode 100644 index 0000000..32e63ef --- /dev/null +++ b/common/mujoco_util.py @@ -0,0 +1,207 @@ +import numpy as np +import mujoco_py as mj + +class MujocoCompensatedPDController: + """ + Compensated joint-space PD controller + """ + def __init__(self, sim, joint_names, kp=1, kv=1, use_sim_state=True): + self.sim = sim + self.model = sim.model + self.use_sim_state = use_sim_state + self.kp = kp + self.kv = kv + + sim.forward() + model = sim.model + # assums 1DoF for all joints + joint_ids = [ + model.joint_name2id(x) + for x in joint_names] + + # compute joint data addrs + self.joint_pos_addrs = [model.get_joint_qpos_addr(name) for name in joint_names] + self.joint_vel_addrs = [model.get_joint_qvel_addr(name) for name in joint_names] + + joint_pos_addrs = [] + for elem in self.joint_pos_addrs: + if isinstance(elem, tuple): + joint_pos_addrs += list(range(elem[0], elem[1])) + else: + joint_pos_addrs.append(elem) + self.joint_pos_addrs = joint_pos_addrs + + joint_vel_addrs = [] + for elem in self.joint_vel_addrs: + if isinstance(elem, tuple): + joint_vel_addrs += list(range(elem[0], elem[1])) + else: + joint_vel_addrs.append(elem) + self.joint_vel_addrs = joint_vel_addrs + + # Need to also get the joint rows of the Jacobian, inertia matrix, and + # gravity vector. This is trickier because if there's a quaternion in + # the joint (e.g. a free joint or a ball joint) then the joint position + # address will be different than the joint Jacobian row. This is because + # the quaternion joint will have a 4D position and a 3D derivative. So + # we go through all the joints, and find out what type they are, then + # calculate the Jacobian position based on their order and type. 
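# (illustrative example) In velocity space, each controlled hinge or slide joint
# advances the row index below by 1, a ball joint by 3, and a free joint by 6,
# matching MuJoCo's qvel/Jacobian (nv) layout.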
+ index = 0 + self.joint_dyn_addrs = [] + for ii, joint_type in enumerate(model.jnt_type): + if ii in joint_ids: + self.joint_dyn_addrs.append(index) + if joint_type == 0: # free joint + self.joint_dyn_addrs += [jj + index for jj in range(1, 6)] + index += 6 # derivative has 6 dimensions + elif joint_type == 1: # ball joint + self.joint_dyn_addrs += [jj + index for jj in range(1, 3)] + index += 3 # derivative has 3 dimension + else: # slide or hinge joint + index += 1 # derivative has 1 dimensions + + self.joint_pos_addrs = np.copy(self.joint_pos_addrs) + self.joint_vel_addrs = np.copy(self.joint_vel_addrs) + self.joint_dyn_addrs = np.copy(self.joint_dyn_addrs) + + # number of controllable joints in the robot arm + self.N_JOINTS = len(self.joint_dyn_addrs) + # number of joints in the Mujoco simulation + N_ALL_JOINTS = self.sim.model.nv + + # need to calculate the joint_dyn_addrs indices in flat vectors returned + # for the Jacobian + self.jac_indices = np.hstack( + # 6 because position and rotation Jacobians are 3 x N_JOINTS + [self.joint_dyn_addrs + (ii * N_ALL_JOINTS) for ii in range(3)] + ) + + # for the inertia matrix + self.M_indices = [ + ii * N_ALL_JOINTS + jj + for jj in self.joint_dyn_addrs + for ii in self.joint_dyn_addrs + ] + + # a place to store data returned from Mujoco + self._g = np.zeros(self.N_JOINTS) + self._J3NP = np.zeros(3 * N_ALL_JOINTS) + self._J3NR = np.zeros(3 * N_ALL_JOINTS) + self._J6N = np.zeros((6, self.N_JOINTS)) + self._MNN_vector = np.zeros(N_ALL_JOINTS ** 2) + self._MNN = np.zeros(self.N_JOINTS ** 2) + self._R9 = np.zeros(9) + self._R = np.zeros((3, 3)) + self._x = np.ones(4) + self.N_ALL_JOINTS = N_ALL_JOINTS + + @property + def q(self): + return np.copy(self.sim.data.qpos[self.joint_pos_addrs]) + + @property + def dq(self): + return np.copy(self.sim.data.qvel[self.joint_vel_addrs]) + + def generate(self, target, target_velocity=None): + if target_velocity is None: + target_velocity = np.zeros(self.N_JOINTS) + + q = self.q + dq = self.dq + + q_tilde = target - q + if False: + # calculate the direction for each joint to move, wrapping + # around the -pi to pi limits to find the shortest distance + q_tilde = ((target - q + np.pi) % (np.pi * 2)) - np.pi + + # get the joint space inertia matrix + M = self.M(q) + u = np.dot(M, (self.kp * q_tilde + self.kv * (target_velocity - dq))) + # account for gravity + u -= self.g(q) + return u + + def send_forces(self, u, step=False): + # NOTE: the qpos_addr's are unrelated to the order of the motors + # NOTE: assuming that the robot arm motors are the first len(u) values + self.sim.data.ctrl[:] = u[:] + + # move simulation ahead one time step + if step: + self.sim.step() + + def g(self, q=None): + """Returns qfrc_bias variable, which stores the effects of Coriolis, + centrifugal, and gravitational forces + + Parameters + ---------- + q: float numpy.array, optional (Default: None) + The joint angles of the robot. If None the current state is + retrieved from the Mujoco simulator + """ + # TODO: For the Coriolis and centrifugal functions, setting the + # velocity before calculation is important, how best to do this? 
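# (note) sim.data.qfrc_bias holds the combined Coriolis, centrifugal and gravity
# terms for the current state; this method returns their negation restricted to
# the controlled joints, and generate() subtracts it (u -= self.g(q)) so the bias
# is added back as a compensation torque.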
+ if not self.use_sim_state and q is not None: + old_q, old_dq, old_u = self._load_state(q) + + g = -1 * self.sim.data.qfrc_bias[self.joint_dyn_addrs] + + if not self.use_sim_state and q is not None: + self._load_state(old_q, old_dq, old_u) + return g + + def M(self, q=None): + """Returns the inertia matrix in task space + + Parameters + ---------- + q: float numpy.array, optional (Default: None) + The joint angles of the robot. If None the current state is + retrieved from the Mujoco simulator + """ + if not self.use_sim_state and q is not None: + old_q, old_dq, old_u = self._load_state(q) + + # stored in mjData.qM, stored in custom sparse format, + # convert qM to a dense matrix with mj_fullM + mj.cymj._mj_fullM(self.model, self._MNN_vector, self.sim.data.qM) + M = self._MNN_vector[self.M_indices] + M = M.reshape((self.N_JOINTS, self.N_JOINTS)) + + if not self.use_sim_state and q is not None: + self._load_state(old_q, old_dq, old_u) + + return np.copy(M) + + def _load_state(self, q=None, dq=None, u=None): + """Change the current joint angles + + Parameters + ---------- + q: np.array + The set of joint angles to move the arm to [rad] + dq: np.array + The set of joint velocities to move the arm to [rad/sec] + u: np.array + The set of joint forces to apply to the arm joints [Nm] + """ + # save current state + old_q = np.copy(self.sim.data.qpos[self.joint_pos_addrs]) + old_dq = np.copy(self.sim.data.qvel[self.joint_vel_addrs]) + old_u = np.copy(self.sim.data.ctrl) + + # update positions to specified state + if q is not None: + self.sim.data.qpos[self.joint_pos_addrs] = np.copy(q) + if dq is not None: + self.sim.data.qvel[self.joint_vel_addrs] = np.copy(dq) + if u is not None: + self.sim.data.ctrl[:] = np.copy(u) + + # move simulation forward to calculate new kinamtic information + self.sim.forward() + + return old_q, old_dq, old_u diff --git a/common/sample_util.py b/common/sample_util.py new file mode 100644 index 0000000..9df4b65 --- /dev/null +++ b/common/sample_util.py @@ -0,0 +1,311 @@ +from typing import Sequence, Tuple, Optional + +import numpy as np +import zarr +import numcodecs as nc +import scipy.interpolate as si + + +def exp_range(start, stop, num): + start_log = np.log(start) + stop_log = np.log(stop) + log_samples = np.linspace(start_log, stop_log, num) + samples = np.exp(log_samples) + return samples + +def poly_range(start, stop, num, deg=2): + start_p = np.power(start, 1/deg) + end_p = np.power(stop, 1/deg) + p_samples = np.linspace(start_p, end_p, num) + samples = np.power(p_samples, deg) + return samples + + +def get_nd_index_volume(shape): + index_volume = np.moveaxis(np.stack(np.meshgrid( + *(np.arange(x) for x in shape) + , indexing='ij')), 0, -1) + return index_volume + + +def get_grid_samples(*dim_samples): + arrs = np.meshgrid(*dim_samples, indexing='ij') + samples_volume = np.stack(arrs, axis=-1) + return samples_volume + + +def get_flat_idx_samples(samples_volume, sample_dims=1): + dim_idxs = [np.arange(x) for x in samples_volume.shape[:-sample_dims]] + dim_idxs_volume = get_grid_samples(*dim_idxs) + idxs_flat = dim_idxs_volume.reshape(-1, dim_idxs_volume.shape[-1]) + samples_flat = samples_volume.reshape(-1, *samples_volume.shape[-sample_dims:]) + return idxs_flat, samples_flat + + +class VirtualSampleGrid: + default_data_name = 'data' + + def __init__(self, + axes_samples: Sequence[Tuple[str, np.array]], + zarr_group: zarr.Group, + compressor: Optional[nc.abc.Codec] = None): + dim_keys = list() + dim_samples = list() + for key, samples in axes_samples: + 
dim_keys.append(key) + dim_samples.append(samples) + # check uniqueness + assert(len(set(dim_keys)) == len(dim_keys)) + + if compressor is None: + compressor = self.get_default_compressor() + + self.dim_keys = dim_keys + self.dim_samples = dim_samples + + self.zarr_group = zarr_group + self.compressor = compressor + + @classmethod + def from_zarr_group(cls, zarr_group): + dim_keys = zarr_group['dim_keys'][:].tolist() + dim_samples_group = zarr_group['dim_samples'] + dim_samples = list() + for key in dim_keys: + dim_samples.append(dim_samples_group[key][:]) + + compressor = None + if cls.default_data_name in zarr_group: + compressor = zarr_group['data'].compressor + + sample_grid = cls( + zip(dim_keys, dim_samples), + zarr_group=zarr_group, + compressor=compressor) + return sample_grid + + @staticmethod + def get_default_compressor(): + return nc.Blosc( + cname='zstd', clevel=6, + shuffle=nc.Blosc.BITSHUFFLE) + + def get_sample(self, index_tuple): + assert(len(index_tuple) == len(self.dim_samples)) + return tuple(self.dim_samples[i][j] + for i, j in enumerate(index_tuple)) + + def get_idxs_volume(self, dim_ids=None): + if dim_ids is None: + dim_ids = list(range(len(self.shape))) + dim_idxs = [np.arange(len(self.dim_samples[i])) + for i in dim_ids] + dim_idxs_volume = get_grid_samples(*dim_idxs) + return dim_idxs_volume + + def get_idxs_flat(self, dim_ids=None): + dim_idxs_volume = self.get_idxs_volume(dim_ids=dim_ids) + idxs_flat = dim_idxs_volume.reshape(-1, dim_idxs_volume.shape[-1]) + return idxs_flat + + def write_axes(self): + dim_keys = self.dim_keys + dim_samples = self.dim_samples + zarr_group = self.zarr_group + compressor = self.compressor + + dim_samples_group = zarr_group.require_group( + 'dim_samples', overwrite=False) + for key, value in zip(dim_keys, dim_samples): + dim_samples_group.array( + name=key, data=value, chunks=value.shape, + compressor=compressor, overwrite=True) + zarr_group.array(name='dim_keys', data=dim_keys, + compressor=compressor, overwrite=True) + + def allocate_data(self, + name=None, + grid_shape=None, data_shape=tuple(), dtype=np.float32, + fill_value=np.nan, overwrite=False, compressor=None): + zarr_group = self.zarr_group + if compressor is None: + compressor = self.compressor + if name is None: + name = self.default_data_name + if grid_shape is None: + grid_shape = self.shape + + data_array = zarr_group.require_dataset( + name=name, dtype=dtype, compressor=compressor, + shape=grid_shape + data_shape, + chunks=(1,) * len(grid_shape) + data_shape, + fill_value=fill_value, + overwrite=overwrite) + return data_array + + @property + def shape(self): + return tuple(len(x) for x in self.dim_samples) + + +class NdGridInterpolator: + def __init__(self, + grid: np.array, + dim_samples: Sequence[np.array], + seed: int = 0): + assert(grid.shape == tuple(len(x) for x in dim_samples)) + grid_coords = np.vstack(np.nonzero(grid)).T + dim_interpolators = [si.interp1d( + x=np.arange(len(y)), + y=y, fill_value='extrapolate') + for y in dim_samples] + rs = np.random.RandomState(seed=seed) + + self.grid_coords = grid_coords + self.dim_interpolators = dim_interpolators + self.rs = rs + + def get_coord_sample(self, size: int) -> np.ndarray: + grid_coords = self.grid_coords + rs = self.rs + coord_idx_sample = rs.choice(len(grid_coords), size=size) + diffuse_sample = rs.uniform(0, 1, + size=(size, grid_coords.shape[-1])) + coord_sample = grid_coords[coord_idx_sample] + diffuse_sample + return coord_sample + + def get_sample(self, size: int) -> np.ndarray: + dim_interpolators 
= self.dim_interpolators + + coord_sample = self.get_coord_sample(size) + sample = np.zeros_like(coord_sample) + for i in range(coord_sample.shape[-1]): + sample[:,i] = dim_interpolators[i](coord_sample[:,i]) + return sample + + +class GridCoordTransformer: + def __init__(self, + low: tuple, high:tuple, + grid_shape: tuple): + self.kwargs = { + 'low': low, + 'high': high, + 'grid_shape': grid_shape + } + + low = np.array(low) + high = np.array(high) + grid_shape = np.array(grid_shape) + + # pixel per meter + scale = grid_shape / (high - low) + + # offset + real = pixel origin + offset = - low + + self.scale = scale + self.offset = offset + self.grid_shape = grid_shape + + def to_zarr(self, zarr_path): + root = zarr.open(zarr_path, 'rw') + attrs = root.attrs.asdict() + attrs['transformer'] = self.kwargs + root.attrs.put(attrs) + + @classmethod + def from_zarr(cls, zarr_path): + root = zarr.open(zarr_path, 'r') + attrs = root.attrs.asdict() + kwargs = attrs['transformer'] + return cls(**kwargs) + + @property + def shape(self): + return tuple(self.grid_shape.tolist()) + + @property + def pix_per_m(self): + return np.mean(self.scale) + + def to_grid(self, coords, clip=True): + offset = self.offset + scale = self.scale + grid_shape = self.grid_shape + + result = (coords + offset) * scale + if clip: + result = np.clip( + result, a_min=(0,0), a_max=grid_shape) + return result + + def from_grid(self, coords): + offset = self.offset + scale = self.scale + + result = (coords / scale) - offset + return result + + +def ceil_div(a, b): + return -(-a // b) + +class ArraySlicer: + def __init__(self, shape: tuple, chunks: tuple): + assert(len(chunks) <= len(shape)) + relevent_shape = shape[:len(chunks)] + chunk_size = tuple(ceil_div(*x) \ + for x in zip(relevent_shape, chunks)) + + self.relevent_shape = relevent_shape + self.chunks = chunks + self.chunk_size = chunk_size + + def __len__(self): + chunk_size = self.chunk_size + return int(np.prod(chunk_size)) + + def __getitem__(self, idx): + relevent_shape = self.relevent_shape + chunks = self.chunks + chunk_size = self.chunk_size + chunk_stride = np.cumprod((chunk_size[1:] + (1,))[::-1])[::-1] + chunk_idx = list() + mod = idx + for x in chunk_stride: + chunk_idx.append(mod // x) + mod = mod % x + + slices = list() + for i in range(len(chunk_idx)): + start = chunks[i] * chunk_idx[i] + end = min(relevent_shape[i], + chunks[i] * (chunk_idx[i] + 1)) + slices.append(slice(start, end)) + return slices + + def __iter__(self): + for i in range(len(self)): + yield self[i] + + +def transpose_data_dict(data): + assert(len(data) > 0) + result = dict() + for key in data[0].keys(): + if isinstance(data[0][key], list): + result[key] = [x[key] for x in data] + elif isinstance(data[0][key], np.ndarray): + same_shape = True + shape = data[0][key].shape + for x in data: + if x[key].shape != shape: + same_shape = False + if same_shape: + result[key] = np.array([x[key] for x in data]) + else: + result[key] = [x[key] for x in data] + else: + result[key] = np.array([x[key] for x in data]) + return result diff --git a/common/template_util.py b/common/template_util.py new file mode 100644 index 0000000..174476b --- /dev/null +++ b/common/template_util.py @@ -0,0 +1,26 @@ +import os +import hashlib +import pickle + +import jinja2 + + +def hash_obj(obj): + m = hashlib.md5() + m.update(pickle.dumps(obj)) + result = m.hexdigest() + return result + + +def require_xml(folder, param_dict, template_path, force=False) -> str: + fname = hash_obj(param_dict) + '.xml' + path = 
os.path.join(folder, fname) + if (not force) and os.path.isfile(path): + return fname + + template = jinja2.Template(open(template_path, 'r').read()) + xml_text = template.render(**param_dict) + with open(path, 'w') as f: + f.write(xml_text) + + return fname diff --git a/common/torch_util.py b/common/torch_util.py new file mode 100644 index 0000000..cb4c395 --- /dev/null +++ b/common/torch_util.py @@ -0,0 +1,60 @@ +import torch +import numpy as np + + +def to_numpy(x): + return x.detach().to('cpu').numpy() + + +def dict_to(x, device): + new_x = dict() + for key, value in x.items(): + if isinstance(value, torch.Tensor): + value = value.to(device) + new_x[key] = value + return new_x + + +def explode_shape(input, shape, dim=-1): + """ + Reshape a 1D tensor such that it matches the shape except + dim, index into shape + """ + assert(isinstance(input, torch.Tensor)) + if isinstance(dim, int): + dim = [dim] + assert(len(input.shape) == len(dim)) + single_shape = np.ones(len(shape), dtype=np.int64) + single_shape[dim] = input.shape + single = input.view(*tuple(single_shape.tolist())) + + result_shape = np.array(shape, dtype=np.int64) + result_shape[dim] = -1 + result = single.expand(*tuple(result_shape.tolist())) + return result + + +def sort_select(input, *attr_tensors, + dim=-1, dim_slice=slice(None), + sort_descending=True): + """ + Sort attributes according to input's value + on dim. All needs to have the same number of dims + before and at dim + """ + last_dim = np.arange(len(input.shape))[dim] + 1 + for attr_tensor in attr_tensors: + assert(attr_tensor.shape[:last_dim] == input.shape[:last_dim]) + selector = (slice(None),) * (last_dim - 1) + (dim_slice,) + + sorted_input, sorted_idxs = torch.sort( + input, dim=dim, descending=sort_descending) + + result = [sorted_input[selector]] + for attr_tensor in attr_tensors: + selected_attr_tensor = torch.gather( + attr_tensor, dim=dim, + index=sorted_idxs.expand_as( + attr_tensor))[selector] + result.append(selected_attr_tensor) + return tuple(result) diff --git a/common/trapezoid.py b/common/trapezoid.py new file mode 100644 index 0000000..e97a65b --- /dev/null +++ b/common/trapezoid.py @@ -0,0 +1,81 @@ +import numpy as np + + +def get_trapezoid_phase_profile( + dt=0.01, + start_phase=0.0, + end_phase=1.0, + speed=1.0, + acceleration=1.0, + start_padding=0.0, + end_padding=0.0, + dtype=np.float64): + + # calculate duration + assert(end_phase > start_phase) + total_travel = end_phase - start_phase + + t_cruise = None + t_acc = None + max_speed = None + tri_max_speed = np.sqrt(acceleration * total_travel) + if tri_max_speed <= speed: + # triangle + t_acc = total_travel / tri_max_speed + t_cruise = 0 + max_speed = tri_max_speed + else: + # trapozoid + t_acc = speed / acceleration + tri_travel = t_acc * speed + t_cruise = (total_travel - tri_travel) / speed + max_speed = speed + + duration = start_padding + end_padding + t_acc * 2 + t_cruise + key_point_diff_arr = np.array([ + start_padding, t_acc, t_cruise, t_acc, end_padding], dtype=dtype) + key_point_time_arr = np.cumsum(key_point_diff_arr) + + all_time_steps = np.linspace(0, duration, int(np.ceil(duration / dt)), dtype=dtype) + phase_steps = np.zeros_like(all_time_steps) + + # start_padding + mask_idxs = np.flatnonzero(all_time_steps < key_point_time_arr[0]) + if len(mask_idxs) > 0: + phase_steps[mask_idxs] = start_phase + + # acceleartion + mask_idxs = np.flatnonzero( + (key_point_time_arr[0] <= all_time_steps) + & (all_time_steps <= key_point_time_arr[1])) + if len(mask_idxs) > 0: + 
phase_steps[mask_idxs] = acceleration / 2 * np.square( + all_time_steps[mask_idxs] - key_point_time_arr[0]) + acc_dist = acceleration / 2 * (t_acc ** 2) + + # cruise + mask_idxs = np.flatnonzero( + (key_point_time_arr[1] < all_time_steps) + & (all_time_steps < key_point_time_arr[2])) + if len(mask_idxs) > 0: + phase_steps[mask_idxs] = max_speed * ( + all_time_steps[mask_idxs] - key_point_time_arr[1]) + acc_dist + cruise_dist = t_cruise * max_speed + + # deceleration + mask_idxs = np.flatnonzero( + (key_point_time_arr[2] <= all_time_steps) + & (all_time_steps <= key_point_time_arr[3])) + if len(mask_idxs) > 0: + curr_time_steps = all_time_steps[mask_idxs] - key_point_time_arr[2] + phase_steps[mask_idxs] = max_speed * curr_time_steps \ + - acceleration / 2 * np.square(curr_time_steps) \ + + acc_dist + cruise_dist + + # end_padding + int_end_phase = acc_dist * 2 + cruise_dist + mask_idxs = np.flatnonzero(key_point_time_arr[3] <= all_time_steps) + if len(mask_idxs) > 0: + phase_steps[mask_idxs] = int_end_phase + + return phase_steps \ No newline at end of file diff --git a/common/urscript_control_util.py b/common/urscript_control_util.py new file mode 100644 index 0000000..4ad6977 --- /dev/null +++ b/common/urscript_control_util.py @@ -0,0 +1,30 @@ +import numpy as np +from scipy.interpolate import interp1d +from common.trapezoid import get_trapezoid_phase_profile + + +def get_movej_trajectory(j_start, j_end, acceleration, speed, dt=0.001): + assert(acceleration > 0) + assert(speed > 0) + + j_delta = j_end - j_start + j_delta_abs = np.abs(j_delta) + j_delta_max = np.max(j_delta_abs) + + if j_delta_max != 0: + # compute phase parameters + phase_vel = speed / j_delta_max + phase_acc = acceleration * (phase_vel) + + phase = get_trapezoid_phase_profile(dt=dt, + speed=phase_vel, acceleration=phase_acc) + + interp = interp1d([0,1], [j_start, j_end], + axis=0, fill_value='extrapolate') + j_traj = interp(phase) + else: + j_traj = np.array([j_start, j_end]) + + assert(np.allclose(j_traj[0], j_start)) + assert(np.allclose(j_traj[-1], j_end)) + return j_traj diff --git a/common/wandb_util.py b/common/wandb_util.py new file mode 100644 index 0000000..aeb17ad --- /dev/null +++ b/common/wandb_util.py @@ -0,0 +1,100 @@ +import os +import pathlib + +import numpy as np +import pandas as pd +import scipy.stats as st +import wandb + +def get_checkpoint_df(checkpoint_dir): + all_checkpoint_paths = sorted(pathlib.Path(checkpoint_dir).glob('*.ckpt')) + rows = list() + for path in all_checkpoint_paths: + fname = path.stem + row = dict() + split_result = fname.split('-') + if len(split_result) != 2: + continue + for item in split_result: + key, value = item.split('=') + row[key] = float(value) + row['path'] = str(path.absolute()) + rows.append(row) + checkpoint_df = pd.DataFrame(rows) + return checkpoint_df + + +def get_best_checkpoint(output_dir): + checkpoint_dir = os.path.join(output_dir, 'checkpoints') + checkpoint_df = get_checkpoint_df(checkpoint_dir) + checkpoint_path = checkpoint_df.loc[checkpoint_df.val_loss.idxmin()].path + return checkpoint_path + + +def get_error_plots_log(key, errors): + k_min_error = np.minimum.accumulate(errors, axis=-1) + early_stop_error = errors.copy() + for i in range(len(early_stop_error)): + curr_min = early_stop_error[i,0] + for j in range(len(early_stop_error[i])): + this = early_stop_error[i,j] + if curr_min < 0.02: + early_stop_error[i,j] = curr_min + else: + curr_min = min(curr_min, this) + + rope_log_df = pd.DataFrame({ + 'step': np.arange(errors.shape[-1]), + 'error': 
np.mean(errors, axis=0)*100, + 'error_sem': st.sem(errors, axis=0)*100, + 'median_error': np.median(errors, axis=0)*100, + 'k_min_error': np.mean(k_min_error, axis=0)*100, + 'early_stop_error': np.mean(early_stop_error, axis=0)*100, + 'early_stop_error_sem': st.sem(early_stop_error, axis=0)*100 + }) + + table = wandb.Table(dataframe=rope_log_df) + log = { + key+'/table': table, + key+'/error': wandb.plot.line( + table, 'step', 'error', title="error(cm)"), + key+'/median_error': wandb.plot.line( + table, 'step', 'median_error', title="median error(cm)"), + key+'/k_min_error': wandb.plot.line( + table, 'step', 'k_min_error', title="k-min error(cm)"), + key+'/early_stop_error': wandb.plot.line( + table, 'step', 'early_stop_error', title="early stop error(cm)") + } + return log + +def get_raw_runs_df(api, path): + runs_iter = api.runs(path=path) + rows = list() + for run in runs_iter: + row = { + 'name': run.name, + 'id': run.id, + 'tags': run.tags, + 'state': run.state, + 'created_at': pd.Timestamp(run.created_at), + 'heartbeat_at': pd.Timestamp(run.heartbeat_at), + 'notes': run.notes, + 'summary': run.summary._json_dict, + 'config': run.config, + 'run': run + } + rows.append(row) + runs_df = pd.DataFrame(rows) + return runs_df + +def get_row_tag( + row, + keys=['Dress', 'Jumpsuit', 'Skirt', 'Top', 'Trousers', 'Tshirt']): + tags = set(row.tags) + for key in keys: + if key in tags: + return key + return None + +def get_row_output_dir(row): + return row['config']['output_dir'] diff --git a/common/zarr_util.py b/common/zarr_util.py new file mode 100644 index 0000000..9d7356a --- /dev/null +++ b/common/zarr_util.py @@ -0,0 +1,188 @@ +import os + +import zarr +import numpy as np + +from common.sample_util import get_nd_index_volume + + +def get_hybrid_coordinate_selection( + array: zarr.Array, + indices: np.ndarray, + index_dims: tuple): + """ + Example + array: (50,50,50,10,256,256) + indices: (256,256,10,5) + index_dims: (0,1,2,-2,-1) + return: (256,256,10,10) + """ + assert(indices.shape[-1] == len(index_dims)) + flat_indices = indices.reshape(-1, indices.shape[-1]) + + array_shape = np.array(array.shape) + dim_is_index = np.zeros(len(array.shape), dtype=bool) + dim_is_index[index_dims] = True + + item_shape = array_shape[~dim_is_index] + item_size = np.prod(item_shape) + + item_coords = get_nd_index_volume(item_shape) + num_coords = len(flat_indices) * item_size + + coords = np.zeros( + (len(array.shape), num_coords), + dtype=np.uint64) + coords[dim_is_index] = np.repeat( + flat_indices, item_size, axis=0).T + coords[~dim_is_index] = np.tile( + item_coords, (flat_indices.shape[0],1)).T + + # get data + data = array.get_coordinate_selection(tuple(coords)) + + # reshape result + result = data.reshape( + tuple(indices.shape[:-1]) + tuple(item_shape)) + + return result + + +def get_initialized_chunk_coords( + array: zarr.Array, sort=True): + n_dims = len(array.shape) + array_dir = os.path.join(array.store.path, array.path) + fnames = os.listdir(array_dir) + + coords_list = list() + for fname in fnames: + this_strs = fname.split('.') + if len(this_strs) == n_dims: + coords_list.append([int(x) for x in this_strs]) + coords = np.array(coords_list, dtype=np.int64) + if sort: + sorted_idxs = np.lexsort(coords.T[::-1], axis=-1) + coords = coords[sorted_idxs] + return coords + + +def get_is_initialized_volume( + array: zarr.Array, dims=None): + if dims is None: + dims = np.arange(len(array.shape))[ + np.array(array.chunks) == 1] + + coords = get_initialized_chunk_coords( + array=array, sort=False) + 
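# (note) with the default dims (axes whose chunk size is 1), a chunk coordinate
# equals the array index along that axis, so the initialized-chunk coordinates
# can directly mark cells of the boolean volume built below.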
selected_shape = tuple(np.array(array.shape)[dims]) + if len(coords) == 0: + return np.zeros(selected_shape, dtype=bool) + + selected_coords = coords[:,dims] + is_initialized_volume = np.zeros(selected_shape, dtype=bool) + is_initialized_volume[tuple(selected_coords.T)] = True + return is_initialized_volume + + +def parse_bytes(s): + """Parse byte string to numbers + + >>> from dask.utils import parse_bytes + >>> parse_bytes('100') + 100 + >>> parse_bytes('100 MB') + 100000000 + >>> parse_bytes('100M') + 100000000 + >>> parse_bytes('5kB') + 5000 + >>> parse_bytes('5.4 kB') + 5400 + >>> parse_bytes('1kiB') + 1024 + >>> parse_bytes('1e6') + 1000000 + >>> parse_bytes('1e6 kB') + 1000000000 + >>> parse_bytes('MB') + 1000000 + >>> parse_bytes(123) + 123 + >>> parse_bytes('5 foos') + Traceback (most recent call last): + ... + ValueError: Could not interpret 'foos' as a byte unit + """ + if isinstance(s, (int, float)): + return int(s) + s = s.replace(" ", "") + if not any(char.isdigit() for char in s): + s = "1" + s + + for i in range(len(s) - 1, -1, -1): + if not s[i].isalpha(): + break + index = i + 1 + + prefix = s[:index] + suffix = s[index:] + + try: + n = float(prefix) + except ValueError as e: + raise ValueError("Could not interpret '%s' as a number" % prefix) from e + + try: + multiplier = byte_sizes[suffix.lower()] + except KeyError as e: + raise ValueError("Could not interpret '%s' as a byte unit" % suffix) from e + + result = n * multiplier + return int(result) + + +byte_sizes = { + "kB": 10 ** 3, + "MB": 10 ** 6, + "GB": 10 ** 9, + "TB": 10 ** 12, + "PB": 10 ** 15, + "KiB": 2 ** 10, + "MiB": 2 ** 20, + "GiB": 2 ** 30, + "TiB": 2 ** 40, + "PiB": 2 ** 50, + "B": 1, + "": 1, +} +byte_sizes = {k.lower(): v for k, v in byte_sizes.items()} +byte_sizes.update({k[0]: v for k, v in byte_sizes.items() if k and "i" not in k}) +byte_sizes.update({k[:-1]: v for k, v in byte_sizes.items() if k and "i" in k}) + + +def open_cached(zarr_path, mode='a', cache_size='4GB', **kwargs): + cache_bytes = 0 + if cache_size: + cache_bytes = parse_bytes(cache_size) + + store = zarr.DirectoryStore(zarr_path) + chunk_store = store + if cache_bytes > 0: + chunk_store = zarr.LRUStoreCache(store, max_size=cache_bytes) + + group = zarr.open_group(store=store, mode=mode, chunk_store=chunk_store, **kwargs) + return group + + +def require_parent_group(group, name, overwrite=False): + parent_name = os.path.dirname(name) + if parent_name == '': + return group + if parent_name in group: + pg = group[parent_name] + if isinstance(pg,zarr.hierarchy.Group): + return pg + raise RuntimeError('Group {} already exists!'.format(parent_name)) + pg = require_parent_group(group, parent_name, overwrite=overwrite) + result = pg.require_group(os.path.basename(name), overwrite=overwrite) + return result diff --git a/components/deeplab_v3_plus.py b/components/deeplab_v3_plus.py new file mode 100644 index 0000000..ac1405c --- /dev/null +++ b/components/deeplab_v3_plus.py @@ -0,0 +1,252 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + +from components.resnet import ResNet50, ResNet101 + + +def get_1x_lr_params(model): + """ + This generator returns all the parameters of the net except for + the last classification layer. 
Note that for each batchnorm layer, + requires_grad is set to False in deeplab_resnet.py, therefore this function does not return + any batchnorm parameter + """ + b = [model.resnet_features] + for i in range(len(b)): + for k in b[i].parameters(): + if k.requires_grad: + yield k + + +def get_10x_lr_params(model): + """ + This generator returns all the parameters for the last layer of the net, + which does the classification of pixel into classes + """ + b = [model.aspp1, model.aspp2, model.aspp3, model.aspp4, model.conv1, model.conv2, model.last_conv] + for j in range(len(b)): + for k in b[j].parameters(): + if k.requires_grad: + yield k + + +class ASPP_module(nn.Module): + def __init__(self, inplanes, planes, rate): + super(ASPP_module, self).__init__() + if rate == 1: + kernel_size = 1 + padding = 0 + else: + kernel_size = 3 + padding = rate + self.atrous_convolution = nn.Conv2d(inplanes, planes, kernel_size=kernel_size, + stride=1, padding=padding, dilation=rate, bias=False) + self.bn = nn.BatchNorm2d(planes) + self.relu = nn.ReLU() + + self._init_weight() + + def forward(self, x): + x = self.atrous_convolution(x) + x = self.bn(x) + + return self.relu(x) + + def _init_weight(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + torch.nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +class DeepLabv3_plus(nn.Module): + def __init__(self, nInputChannels=3, n_classes=21, os=16): + super().__init__() + + # Atrous Conv + self.resnet_features = ResNet101(nInputChannels, os) + + # ASPP + if os == 16: + rates = [1, 6, 12, 18] + elif os == 8: + rates = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = ASPP_module(2048, 256, rate=rates[0]) + self.aspp2 = ASPP_module(2048, 256, rate=rates[1]) + self.aspp3 = ASPP_module(2048, 256, rate=rates[2]) + self.aspp4 = ASPP_module(2048, 256, rate=rates[3]) + + self.relu = nn.ReLU() + + self.global_avg_pool = nn.Sequential(nn.AdaptiveAvgPool2d((1, 1)), + nn.Conv2d(2048, 256, 1, stride=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU()) + + self.conv1 = nn.Conv2d(1280, 256, 1, bias=False) + self.bn1 = nn.BatchNorm2d(256) + + # adopt [1x1, 48] for channel reduction. 
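# (note) the five ASPP branches concatenate to 1280 channels and are projected to
# 256 by conv1; the 48 low-level channels produced by conv2 below are concatenated
# with that 256-channel map in forward(), giving the 304-channel input of last_conv.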
+ self.conv2 = nn.Conv2d(256, 48, 1, bias=False) + self.bn2 = nn.BatchNorm2d(48) + + self.last_conv = nn.Sequential(nn.Conv2d(304, 256, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(), + nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(), + nn.Conv2d(256, n_classes, kernel_size=1, stride=1)) + + def forward(self, input): + x, low_level_features = self.resnet_features(input) + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, size=x4.size()[2:], mode='bilinear', align_corners=True) + + x = torch.cat((x1, x2, x3, x4, x5), dim=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = F.interpolate(x, size=(int(math.ceil(input.size()[-2]/4)), + int(math.ceil(input.size()[-1]/4))), mode='bilinear', align_corners=True) + + low_level_features = self.conv2(low_level_features) + low_level_features = self.bn2(low_level_features) + low_level_features = self.relu(low_level_features) + + + x = torch.cat((x, low_level_features), dim=1) + x = self.last_conv(x) + x = F.interpolate(x, size=input.size()[2:], mode='bilinear', align_corners=True) + + return x + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def __init_weight(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + torch.nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +class DeepLabv3_feature(nn.Module): + def __init__(self, n_channels=3, n_features=64, os=8): + super().__init__() + + # Atrous Conv + self.resnet_features = ResNet50(n_channels, os) + + # ASPP + if os == 16: + rates = [1, 6, 12, 18] + elif os == 8: + rates = [1, 12, 24, 36] + else: + raise NotImplementedError + + self.aspp1 = ASPP_module(2048, 256, rate=rates[0]) + self.aspp2 = ASPP_module(2048, 256, rate=rates[1]) + self.aspp3 = ASPP_module(2048, 256, rate=rates[2]) + self.aspp4 = ASPP_module(2048, 256, rate=rates[3]) + + self.relu = nn.ReLU() + + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2d((1, 1)), + nn.Conv2d(2048, 256, 1, stride=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU()) + + self.conv1 = nn.Conv2d(1280, 256, 1, bias=False) + self.bn1 = nn.BatchNorm2d(256) + + # adopt [1x1, 48] for channel reduction. 
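+ # In this feature-extraction variant the 1x1 channel reduction is done by conv_4 below + # (Conv-BN-ReLU down to 48 channels) on the stride-4 backbone feature, and last_conv + # emits an n_features-channel feature map rather than per-class logits.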
+ # self.conv2 = nn.Conv2d(256, 48, 1, bias=False) + # self.bn2 = nn.BatchNorm2d(48) + + self.conv_4 = nn.Sequential( + nn.Conv2d(256, 48, 1, bias=False), + nn.BatchNorm2d(48), + nn.ReLU() + ) + + self.last_conv = nn.Sequential( + nn.Conv2d(304, n_features, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(n_features), + nn.ReLU(), + nn.Conv2d(n_features, n_features, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(n_features), + nn.ReLU(), + nn.Conv2d(n_features, n_features, kernel_size=1, stride=1)) + + def forward(self, input): + resnet_result = self.resnet_features(input) + x = resnet_result['out'] + feature_4 = resnet_result['feature_4'] + feature_2 = resnet_result['feature_2'] + x1 = self.aspp1(x) + x2 = self.aspp2(x) + x3 = self.aspp3(x) + x4 = self.aspp4(x) + x5 = self.global_avg_pool(x) + x5 = F.interpolate(x5, size=x4.size()[2:], mode='bilinear', align_corners=True) + + x = torch.cat((x1, x2, x3, x4, x5), dim=1) + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = F.interpolate(x, size=(int(math.ceil(input.size()[-2]/4)), + int(math.ceil(input.size()[-1]/4))), mode='bilinear', align_corners=True) + + feature_4 = self.conv_4(feature_4) + x = torch.cat((x, feature_4), dim=1) + x = self.last_conv(x) + + x = F.interpolate(x, size=input.size()[2:], mode='bilinear', align_corners=True) + return x + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def __init_weight(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + torch.nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +class OutConv(nn.Module): + def __init__(self, in_channels, out_channels): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=1), + nn.BatchNorm2d(in_channels), + nn.ReLU(), + nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1) + ) + + def forward(self, x): + return self.conv(x) diff --git a/components/resnet.py b/components/resnet.py new file mode 100644 index 0000000..f86ced1 --- /dev/null +++ b/components/resnet.py @@ -0,0 +1,148 @@ +import torch +import torch.nn as nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, rate=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + dilation=rate, padding=rate, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.rate = rate + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + + def __init__(self, nInputChannels, block, layers, os=16): + """ + os: output stride, the ratio of input image resolution to final output resolution + """ + + self.inplanes = 64 + super(ResNet, self).__init__() + if os == 16: + strides = [1, 2, 2, 1] + rates = [1, 
1, 1, 2] + blocks = [1, 2, 4] + elif os == 8: + strides = [1, 2, 1, 1] + rates = [1, 1, 2, 2] + blocks = [1, 2, 1] + else: + raise NotImplementedError + + # Modules + self.conv1 = nn.Conv2d(nInputChannels, 64, kernel_size=7, stride=2, padding=3, + bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.layer1 = self._make_layer(block, 64, layers[0], stride=strides[0], rate=rates[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=strides[1], rate=rates[1]) + self.layer3 = self._make_layer(block, 256, layers[2], stride=strides[2], rate=rates[2]) + self.layer4 = self._make_MG_unit(block, 512, blocks=blocks, stride=strides[3], rate=rates[3]) + + self._init_weight() + + def _make_layer(self, block, planes, blocks, stride=1, rate=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, rate, downsample)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_MG_unit(self, block, planes, blocks=[1,2,4], stride=1, rate=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, rate=blocks[0]*rate, downsample=downsample)) + self.inplanes = planes * block.expansion + for i in range(1, len(blocks)): + layers.append(block(self.inplanes, planes, stride=1, rate=blocks[i]*rate)) + + return nn.Sequential(*layers) + + def forward(self, input): + x = self.conv1(input) + x = self.bn1(x) + x = self.relu(x) + feature_2 = x + x = self.maxpool(x) + + x = self.layer1(x) + feature_4 = x + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + result = { + 'out': x, + 'feature_4': feature_4, + 'feature_2': feature_2, + } + return result + + def _init_weight(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + torch.nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +def ResNet50(nInputChannels=3, os=8): + model = ResNet(nInputChannels, Bottleneck, [3, 4, 6, 3], os) + return model + + +def ResNet101(nInputChannels=3, os=16): + model = ResNet(nInputChannels, Bottleneck, [3, 4, 23, 3], os) + return model diff --git a/components/tracking.py b/components/tracking.py new file mode 100644 index 0000000..a652301 --- /dev/null +++ b/components/tracking.py @@ -0,0 +1,40 @@ +import torch +import numpy as np +import pytorch_lightning as pl +from torchvision import transforms + + +class KeypointTracker(pl.LightningModule): + def __init__(self, predictor): + super().__init__() + self.predictor = predictor + self.transform = transforms.Normalize( + (0.485, 0.456, 0.406), + (0.229, 0.224, 0.225)) + self.predictor.eval() + + def process_input(self, input: np.ndarray, non_blocking=True): + """ + Assuming input is a uint8 ndarray 0-255 with bgra format + N,H,W,C + """ + with torch.no_grad(): + return torch.from_numpy(input)[:,:,:,[2,1,0]].to( + device=self.device, dtype=self.dtype, 
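+ # non_blocking requests an asynchronous host-to-device copy; note that the copy + # is typically only asynchronous when the source CPU tensor is in pinned memory.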
+ non_blocking=non_blocking + ).moveaxis(-1,1).contiguous() / 255 + + def forward(self, x: torch.Tensor): + """ + Return coordinates, confidence + """ + with torch.no_grad(): + N,_,H,W = x.shape + x = self.predictor(self.transform(x))['scoremap'] + C = x.shape[1] + confidence, x = torch.max(x.reshape(N,C,-1), axis=-1) + confidence = torch.sigmoid(confidence) + keypoints = torch.zeros((N,C,2), device=self.device, dtype=torch.int64) + keypoints[:,:,1] = torch.divide(x, W, rounding_mode='floor') + keypoints[:,:,0] = x % W + return keypoints, confidence diff --git a/config/eval_irp_pred_accuracy.yaml b/config/eval_irp_pred_accuracy.yaml new file mode 100644 index 0000000..bc7d132 --- /dev/null +++ b/config/eval_irp_pred_accuracy.yaml @@ -0,0 +1,32 @@ +datamodule: + zarr_path: data/irp_rope.zarr + name: traj_occu_rechunk_action + zarr_cache_size: null + # dataloader + dataloader_params: + batch_size: 16 + num_workers: 8 + prefetch_factor: 4 + pin_memory: False + # async cpu to gpu transfer + async_device: null + # sample + action_sigma: 0.125 + random_init: False + # image + distance: False + pix_per_m: 60.34 + direction: False + size: + train: 10000 + val: 10000 + test: 100000 +model: + ckpt_path: data/checkpoints/irp_rope.ckpt + threshold: 0.2 + gpu_id: 0 +wandb: + project: null + name: null + tags: [] +offline: True diff --git a/config/eval_irp_rope_dataset.yaml b/config/eval_irp_rope_dataset.yaml new file mode 100644 index 0000000..5b3a135 --- /dev/null +++ b/config/eval_irp_rope_dataset.yaml @@ -0,0 +1,32 @@ +setup: + zarr_path: data/irp_rope.zarr + name: 'traj_occu_rechunk_action' + selection: + n_ropes: 5 + n_goals: 25 + mask_names: ['split/is_test','split/is_interpolate'] + seed: 0 + n_steps: 16 + random_init: True +action: + ckpt_path: data/checkpoints/irp_rope.ckpt + late_fusion: False + gpu_id: 0 + use_fp16: True + sampler: + num_samples: 128 + seed: 0 + selector: + batch_size: 32 + gain: 0.5 + sigma_max: 0.125 + constant_sigma: null + threshold: + max: 0.2 + min: 0.2 + dist_max: 1.0 +wandb: + project: irp_eval + name: null + tags: [] +offline: True diff --git a/config/eval_irp_rope_real.yaml b/config/eval_irp_rope_real.yaml new file mode 100644 index 0000000..4b8f694 --- /dev/null +++ b/config/eval_irp_rope_real.yaml @@ -0,0 +1,59 @@ +setup: + goal_pixs: [[75,155],[150,180],[100,200],[65,180],[70,120]] + n_steps: 10 + save_video: True +env: + action: + swing_actor: + robot_ip: 192.168.0.139 + q_init: [-90,-70,150,-170,-90,-90] + speed_range: [1.0,3.14] + j2_delta_range: [-60,-180] + j3_delta_range: [60,-120] + tcp_offset: 0.5 + action_duration: 0.7 + reset: + action: [0.9,1,1] + duration: 0.5 + speed: 2.8 + vision: + transformer: + zarr_path: data/irp_rope.zarr + projector: + calib_path: data/calibration.pkl + flip_x: True + confidence_threshold: 0.3 + tracker: + ckpt_path: data/checkpoints/rope_tracking.ckpt + gpu_id: 0 + ram_recorder: + maxlen: 6000 + buff_tracker: + max_batch_size: 4 +action: + ckpt_path: data/checkpoints/irp_rope.ckpt + late_fusion: False + gpu_id: 0 + use_fp16: True + sampler: + num_samples: 128 + seed: 0 + selector: + batch_size: 32 + gain: 0.5 + sigma_max: 0.125 + constant_sigma: null + threshold: + max: 0.2 + min: 0.2 + dist_max: 1.0 + init_action: + zarr_path: data/irp_rope.zarr + name: train_rope/best_action_coord + action_scale: 50 + const_action: [0.2, 0.2, 0.6] +wandb: + project: irp_real + name: null + tags: ['ours'] +offline: True diff --git a/config/eval_irp_rope_sim.yaml b/config/eval_irp_rope_sim.yaml new file mode 100644 index 0000000..4643670 --- 
/dev/null +++ b/config/eval_irp_rope_sim.yaml @@ -0,0 +1,53 @@ +env: + transformer: + low: [-3.0, -3.0] + high: [3.0, 3.0] + grid_shape: [256, 256] + xml_dir: data + template_path: assets/mujoco/ur5/ur5_stick_rope_v2.xml.jinja2 + sim: + dt: 0.001 + sim_duration: 4.0 + subsample_rate: 10 + visualize: False + kp: 3000 + j_init_deg: [0,-70,150,-170,-90,0] + seed: 0 + random_init: True + action: + speed_range: [1.0, 3.14] + j2_delta_range: [-1.04720, -3.14159] + j3_delta_range: [1.04720, -2.09440] + impulse_range: [-0.001, 0.001] + acceleration: 10.0 +setup: + zarr_path: data/irp_rope.zarr + name: 'traj_occu_rechunk_action' + selection: + n_ropes: 5 + n_goals: 25 + mask_names: ['split/is_test','split/is_interpolate'] + seed: 0 + n_steps: 16 +action: + ckpt_path: data/checkpoints/irp_rope.ckpt + late_fusion: False + gpu_id: 0 + use_fp16: True + sampler: + num_samples: 128 + seed: 0 + selector: + batch_size: 32 + gain: 0.5 + sigma_max: 0.125 + constant_sigma: null + threshold: + max: 0.2 + min: 0.2 + dist_max: 1.0 +wandb: + project: irp_eval + name: null + tags: [] +offline: True diff --git a/config/train_irp.yaml b/config/train_irp.yaml new file mode 100644 index 0000000..15c617b --- /dev/null +++ b/config/train_irp.yaml @@ -0,0 +1,43 @@ +datamodule: + zarr_path: data/irp_rope.zarr + name: traj_occu_rechunk_action + zarr_cache_size: null + # dataloader + dataloader_params: + batch_size: 16 + num_workers: 8 + prefetch_factor: 4 + pin_memory: False + # async cpu to gpu transfer + async_device: null + # sample + action_sigma: 0.125 + random_init: False + # image + distance: False + pix_per_m: 60.34 + direction: False + size: + train: 10000 + val: 10000 + test: 10000 +model: + action_sigma: null + learning_rate: 0.001 + weight_decay: 0.000001 + loss: 'bce' +checkpoint_callback: + save_top_k: 10 + every_n_epochs: 1 +vis_callback: + input_key: 'target_trajectory' + num_samples: 16 + seed: 0 +trainer: + gpus: [0] + check_val_every_n_epoch: 1 + val_check_interval: 1.0 +logger: + offline: True + name: null + tags: [] diff --git a/config/train_tracker.yaml b/config/train_tracker.yaml new file mode 100644 index 0000000..88ade34 --- /dev/null +++ b/config/train_tracker.yaml @@ -0,0 +1,66 @@ +datamodule: + data_dir: data/tracking + train_split: 0.95 + split_seed: 0 + batch_size: 4 + num_workers: 4 + sigma: 2.0 + imgaug_cfg: + apply_prob: 0.5 + + crop_size: [400, 400] + max_shift: 0.4 + crop_sampling: hybrid + + cropratio: 0.4 + + rotation: 25 + rotratio: 0.4 + + scale_jitter_lo: 0.5 + scale_jitter_up: 1.25 + + mirror: True + + clahe: True + claheratio: 0.1 + histeq: True + histeqratio: 0.1 + + sharpen: True + sharpenratio: 0.3 + edge: False + emboss: + alpha: [0.0, 1.0] + strength: [0.5, 1.5] + embossratio: 0.1 + + # sigmoid: True + # sigmoidratio: 0.1 + + # gamma: True + # gammaratio: 0.1 + + # linear: True + # linearratio: 0.1 + + gaussian_noise: True + +model: + learning_rate: 0.0005 + weight_decay: 0.0001 + upsample: True +checkpoint_callback: + save_top_k: 10 + every_n_epochs: 1 +vis_callback: + num_samples: 8 + seed: 0 +trainer: + gpus: [0] + check_val_every_n_epoch: 1 + val_check_interval: 1.0 +logger: + offline: True + name: null + tags: [] diff --git a/datasets/delta_trajectory_gaussian_dataset.py b/datasets/delta_trajectory_gaussian_dataset.py new file mode 100644 index 0000000..daab825 --- /dev/null +++ b/datasets/delta_trajectory_gaussian_dataset.py @@ -0,0 +1,213 @@ +import os +import copy + +import numpy as np +from numpy.random.mtrand import random +from torch.utils.data.dataloader import 
DataLoader +import zarr +import torch +import torch.utils.data +import pytorch_lightning as pl +from hydra.utils import to_absolute_path + +from common.sample_util import VirtualSampleGrid, get_nd_index_volume +from common.async_dataloader import AsynchronousLoader +from common.cv_util import get_dist_function_precise +from common.zarr_util import open_cached + +class DeltaTrajectoryGaussianDataset(torch.utils.data.Dataset): + def __init__(self, + # data + zarr_path: str, + name: str = 'traj_occu', + zarr_cache_size: str = '4GB', + # sample setup + rope_coords = None, + is_setup_valid = None, + action_sigma = 0.125, + random_init = True, + size = 10000, + # image setup + distance = False, + pix_per_m = 60.34, + direction = False, + # training params + static_epoch_seed: bool = False, + **kwargs): + super().__init__() + + assert(not (distance and direction)) + path = os.path.expanduser(to_absolute_path(zarr_path)) + assert(os.path.isdir(path)) + root = open_cached(path, mode='r', cache_size=zarr_cache_size) + data_array = root[name] + + assert(rope_coords is not None) + assert(is_setup_valid is not None) + + self.data_array = data_array + self.rope_coords = rope_coords + self.is_setup_valid = is_setup_valid + self.static_epoch_seed = static_epoch_seed + self.action_sigma = action_sigma + self.random_init = random_init + self.size = size + self.distance = distance + self.pix_per_m = pix_per_m + self.direction = direction + + def __len__(self): + return self.size + + def __getitem__(self, idx: int) -> dict: + data_array = self.data_array + rope_coords = self.rope_coords + is_setup_valid = self.is_setup_valid + static_epoch_seed = self.static_epoch_seed + action_sigma = self.action_sigma + random_init = self.random_init + + action_shape = data_array.shape[2:5] + n_init = data_array.shape[5] + + seed = idx if static_epoch_seed else None + rs = np.random.RandomState(seed=seed) + + # select rope + rope_coord = tuple(rope_coords[rs.choice(len(rope_coords))].tolist()) + + # rejection sample base action + n_action = np.prod(action_shape) + base_rs = np.random.RandomState(seed=rs.randint(0,2**16)) + while True: + base_action_idx = base_rs.randint(0, n_action) + base_action_coord = np.unravel_index( + base_action_idx, action_shape) + coord = rope_coord + base_action_coord + if is_setup_valid[coord]: + break + + # rejection sample delta action + action_scale = np.array(action_shape) + delta_rs = np.random.RandomState(seed=rs.randint(0,2**16)) + while True: + delta_action = delta_rs.normal(loc=0, scale=action_sigma, size=3) + delta_coord = (action_scale * delta_action).round().astype(np.int64) + next_coord = np.array(base_action_coord) + delta_coord + if np.any(next_coord < 0) or np.any( + next_coord >= np.array(action_shape)): + continue + next_action_coord = tuple(next_coord.tolist()) + coord = rope_coord + next_action_coord + if is_setup_valid[coord]: + break + + # get init + init_coord = (n_init // 2,) + if random_init: + init_coord = (int(rs.randint(0, n_init)),) + + # get data + input_trajectory = data_array[ + rope_coord + base_action_coord + init_coord] + delta_trajectory = data_array[ + rope_coord + next_action_coord + init_coord] + + # convert data + if self.distance: + pix_per_m = self.pix_per_m + input_df = get_dist_function_precise(input_trajectory) / pix_per_m + delta_df = get_dist_function_precise(delta_trajectory) / pix_per_m + input_trajectory = input_df + delta_trajectory = delta_df + elif self.direction: + input_df = get_dist_function_precise(input_trajectory) + delta_df = 
get_dist_function_precise(delta_trajectory) + is_closer = delta_df < input_df + delta_trajectory = is_closer + + action_delta = delta_coord / action_scale + data = { + 'input_trajectory': np.expand_dims(input_trajectory, axis=0).astype(np.float32), + 'action_delta': action_delta.astype(np.float32), + 'target_trajectory': np.expand_dims(delta_trajectory, axis=0).astype(np.float32) + } + + data_torch = dict() + for key, value in data.items(): + data_torch[key] = torch.from_numpy(value) + return data_torch + + +class DeltaTrajectoryGaussianDataModule(pl.LightningDataModule): + def __init__(self, **kwargs): + super().__init__() + self.kwargs = kwargs + self.set_rope_coords = None + self.is_setup_valid = None + + def prepare_data(self): + kwargs = self.kwargs + zarr_path = os.path.expanduser( + to_absolute_path(kwargs['zarr_path'])) + root = zarr.open(zarr_path, 'r') + + is_setup_valid = root['is_valid'][:] + self.is_setup_valid = is_setup_valid + + is_rope_train = root['split/is_train'][:] + is_rope_val = root['split/is_val'][:] + is_rope_test = root['split/is_test'][:] + + rope_shape = is_setup_valid.shape[:2] + rope_coords_volume = get_nd_index_volume(rope_shape) + set_rope_coords = { + 'train': rope_coords_volume[is_rope_train], + 'val': rope_coords_volume[is_rope_val], + 'test': rope_coords_volume[is_rope_test] + } + self.set_rope_coords = set_rope_coords + + def get_dataset(self, set_name: str): + assert(set_name in ['train', 'val', 'test']) + kwargs = self.kwargs + + static_epoch_seed = (set_name != 'train') + dataset_kwargs = copy.deepcopy(kwargs) + dataset_kwargs['rope_coords'] = self.set_rope_coords[set_name] + dataset_kwargs['is_setup_valid'] = self.is_setup_valid + dataset_kwargs['size'] = kwargs['size'][set_name] + dataset_kwargs['static_epoch_seed'] = static_epoch_seed + dataset = DeltaTrajectoryGaussianDataset(**dataset_kwargs) + return dataset + + def get_dataloader(self, set_name: str): + assert(set_name in ['train', 'val', 'test']) + kwargs = self.kwargs + dataloader_params = kwargs['dataloader_params'] + + is_train = (set_name == 'train') + dataset = self.get_dataset(set_name) + + dataloader = DataLoader( + dataset, + shuffle=is_train, + drop_last=is_train, + **dataloader_params) + + if kwargs['async_device'] is not None: + device_id = kwargs['async_device'] + device = torch.device('cuda', device_id) + dataloader = AsynchronousLoader( + data=dataloader, device=device) + + return dataloader + + def train_dataloader(self): + return self.get_dataloader('train') + + def val_dataloader(self): + return self.get_dataloader('val') + + def test_dataloader(self): + return self.get_dataloader('test') diff --git a/datasets/keypoint_augumentation.py b/datasets/keypoint_augumentation.py new file mode 100644 index 0000000..555dac8 --- /dev/null +++ b/datasets/keypoint_augumentation.py @@ -0,0 +1,209 @@ +import imgaug.augmenters as iaa +from imgaug.augmentables import Keypoint +import numpy as np +import scipy.ndimage as sn + + +from common.sample_util import get_nd_index_volume + + +def build_augmentation_pipeline(cfg: dict, height=None, width=None, apply_prob=0.5): + sometimes = lambda aug: iaa.Sometimes(apply_prob, aug) + pipeline = iaa.Sequential(random_order=False) + + if cfg.get("mirror", None): + opt = cfg.get("mirror", None) # fliplr + if type(opt) == int: + pipeline.add(sometimes(iaa.Fliplr(opt))) + else: + pipeline.add(sometimes(iaa.Fliplr(0.5))) + + if cfg.get("rotation", None) > 0: + pipeline.add( + iaa.Sometimes( + cfg.get("rotratio", None), + 
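# random rotation within +/- cfg["rotation"] degrees, applied with probability cfg["rotratio"] +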
iaa.Affine(rotate=(-cfg.get("rotation", None), cfg.get("rotation", None))), + ) + ) + + if cfg.get("motion_blur", None): + opts = cfg.get("motion_blur_params", None) + pipeline.add(sometimes(iaa.MotionBlur(**opts))) + + if cfg.get("covering", None): + pipeline.add( + sometimes(iaa.CoarseDropout(0.02, size_percent=0.3, per_channel=0.5)) + ) + + if cfg.get("elastic_transform", None): + pipeline.add(sometimes(iaa.ElasticTransformation(sigma=5))) + + if cfg.get("gaussian_noise", False): + opt = cfg.get("gaussian_noise", False) + if type(opt) == int or type(opt) == float: + pipeline.add( + sometimes( + iaa.AdditiveGaussianNoise( + loc=0, scale=(0.0, opt), per_channel=0.5 + ) + ) + ) + else: + pipeline.add( + sometimes( + iaa.AdditiveGaussianNoise( + loc=0, scale=(0.0, 0.05 * 255), per_channel=0.5 + ) + ) + ) + if cfg.get("grayscale", False): + pipeline.add(sometimes(iaa.Grayscale(alpha=(0.5, 1.0)))) + + def get_aug_param(cfg_value): + if isinstance(cfg_value, dict): + opt = cfg_value + else: + opt = {} + return opt + + cfg_cnt = cfg.get("contrast", {}) + cfg_cnv = cfg.get("convolution", {}) + + contrast_aug = ["histeq", "clahe", "gamma", "sigmoid", "log", "linear"] + for aug in contrast_aug: + aug_val = cfg_cnt.get(aug, False) + cfg_cnt[aug] = aug_val + if aug_val: + cfg_cnt[aug + "ratio"] = cfg_cnt.get(aug + "ratio", 0.1) + + convolution_aug = ["sharpen", "emboss", "edge"] + for aug in convolution_aug: + aug_val = cfg_cnv.get(aug, False) + cfg_cnv[aug] = aug_val + if aug_val: + cfg_cnv[aug + "ratio"] = cfg_cnv.get(aug + "ratio", 0.1) + + if cfg_cnt["histeq"]: + opt = get_aug_param(cfg_cnt["histeq"]) + pipeline.add( + iaa.Sometimes( + cfg_cnt["histeqratio"], iaa.AllChannelsHistogramEqualization(**opt) + ) + ) + + if cfg_cnt["clahe"]: + opt = get_aug_param(cfg_cnt["clahe"]) + pipeline.add( + iaa.Sometimes(cfg_cnt["claheratio"], iaa.AllChannelsCLAHE(**opt)) + ) + + if cfg_cnt["log"]: + opt = get_aug_param(cfg_cnt["log"]) + pipeline.add(iaa.Sometimes(cfg_cnt["logratio"], iaa.LogContrast(**opt))) + + if cfg_cnt["linear"]: + opt = get_aug_param(cfg_cnt["linear"]) + pipeline.add( + iaa.Sometimes(cfg_cnt["linearratio"], iaa.LinearContrast(**opt)) + ) + + if cfg_cnt["sigmoid"]: + opt = get_aug_param(cfg_cnt["sigmoid"]) + pipeline.add( + iaa.Sometimes(cfg_cnt["sigmoidratio"], iaa.SigmoidContrast(**opt)) + ) + + if cfg_cnt["gamma"]: + opt = get_aug_param(cfg_cnt["gamma"]) + pipeline.add(iaa.Sometimes(cfg_cnt["gammaratio"], iaa.GammaContrast(**opt))) + + if cfg_cnv["sharpen"]: + opt = get_aug_param(cfg_cnv["sharpen"]) + pipeline.add(iaa.Sometimes(cfg_cnv["sharpenratio"], iaa.Sharpen(**opt))) + + if cfg_cnv["emboss"]: + opt = get_aug_param(cfg_cnv["emboss"]) + pipeline.add(iaa.Sometimes(cfg_cnv["embossratio"], iaa.Emboss(**opt))) + + if cfg_cnv["edge"]: + opt = get_aug_param(cfg_cnv["edge"]) + pipeline.add(iaa.Sometimes(cfg_cnv["edgeratio"], iaa.EdgeDetect(**opt))) + + if height is not None and width is not None: + if not cfg.get("crop_by", False): + crop_by = 0.15 + else: + crop_by = cfg.get("crop_by", False) + pipeline.add( + iaa.Sometimes( + cfg.get("cropratio", 0.4), + iaa.CropAndPad(percent=(-crop_by, crop_by), keep_size=False), + ) + ) + pipeline.add(iaa.Resize({"height": height, "width": width})) + return pipeline + + +def get_gaussian_scoremap(shape, keypoint: np.ndarray, sigma=5, dtype=np.float32): + """ + keypoint is float32 + """ + coord_img = get_nd_index_volume(shape).astype(dtype) + sqrt_dist_img = np.square(np.linalg.norm( + coord_img - keypoint[::-1].astype(dtype), axis=-1)) + scoremap = 
np.exp(-0.5/np.square(sigma)*sqrt_dist_img) + return scoremap + + +def compute_target_part_scoremap_numpy( + joint_id, coords, size): + stride = 8 + half_stride = 4 + locref_scale = 1 / 7.2801 + pos_dist_thresh = 17 + scale = 0.8 + + dist_thresh = float(pos_dist_thresh * scale) + dist_thresh_sq = dist_thresh ** 2 + num_joints = 1 + + scmap = np.zeros(np.concatenate([size, np.array([num_joints])])) + locref_size = np.concatenate([size, np.array([num_joints * 2])]) + locref_mask = np.zeros(locref_size) + locref_map = np.zeros(locref_size) + + width = size[1] + height = size[0] + grid = np.mgrid[:height, :width].transpose((1, 2, 0)) + + for person_id in range(len(coords)): + # for k, j_id in enumerate(joint_id[person_id]): + k = 0 + j_id = 0 + + joint_pt = coords[person_id][k, :] + j_x = np.asscalar(joint_pt[0]) + j_x_sm = round((j_x - half_stride) / stride) + j_y = np.asscalar(joint_pt[1]) + j_y_sm = round((j_y - half_stride) / stride) + min_x = round(max(j_x_sm - dist_thresh - 1, 0)) + max_x = round(min(j_x_sm + dist_thresh + 1, width - 1)) + min_y = round(max(j_y_sm - dist_thresh - 1, 0)) + max_y = round(min(j_y_sm + dist_thresh + 1, height - 1)) + x = grid.copy()[:, :, 1] + y = grid.copy()[:, :, 0] + dx = j_x - x * stride - half_stride + dy = j_y - y * stride - half_stride + dist = dx ** 2 + dy ** 2 + mask1 = dist <= dist_thresh_sq + mask2 = (x >= min_x) & (x <= max_x) + mask3 = (y >= min_y) & (y <= max_y) + mask = mask1 & mask2 & mask3 + scmap[mask, j_id] = 1 + locref_mask[mask, j_id * 2 + 0] = 1 + locref_mask[mask, j_id * 2 + 1] = 1 + locref_map[mask, j_id * 2 + 0] = (dx * locref_scale)[mask] + locref_map[mask, j_id * 2 + 1] = (dy * locref_scale)[mask] + + weights = np.np.ones(scmap.shape) + return scmap, weights, locref_map, locref_mask diff --git a/datasets/keypoint_imgaug_dataset.py b/datasets/keypoint_imgaug_dataset.py new file mode 100644 index 0000000..e409e8e --- /dev/null +++ b/datasets/keypoint_imgaug_dataset.py @@ -0,0 +1,180 @@ +import os +import copy +from pathlib import Path +import json + +import numpy as np +import pandas as pd +from torch.utils.data.dataloader import DataLoader +import torch +import torch.utils.data +import pytorch_lightning as pl +from skimage.io import imread +from torchvision import transforms +import imgaug.augmenters as iaa +from hydra.utils import to_absolute_path + +from datasets.keypoint_augumentation import ( + build_augmentation_pipeline, get_gaussian_scoremap) + + +class KeypointImgaugDataset(torch.utils.data.Dataset): + def __init__(self, + sample_df: pd.DataFrame, + imgaug_cfg: dict, + enable_aug: bool = True, + sigma: float = 5, + **kwargs): + super().__init__() + + assert(len(sample_df) > 0) + sample_item = sample_df.iloc[0] + sample_img_path = sample_item['image_path'] + sample_img = imread(sample_img_path) + img_shape = sample_img.shape[:2] + height, width = img_shape + + imgaug_pipeline = iaa.Sequential([]) + if enable_aug: + apply_prob = imgaug_cfg.get('apply_prob', 0.5) + imgaug_pipeline = build_augmentation_pipeline( + imgaug_cfg, + height=height, width=width, + apply_prob=apply_prob) + + def normalize(img): + mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) + std = np.array([0.229, 0.224, 0.225], dtype=np.float32) + result = (img - mean) / std + return result + + self.image_transform = normalize + self.sample_df = sample_df + self.imgaug_pipeline = imgaug_pipeline + self.sigma = sigma + self.kwargs = kwargs + + def __len__(self): + return len(self.sample_df) + + def __getitem__(self, idx: int) -> dict: + sample_df = 
self.sample_df + imgaug_pipeline = self.imgaug_pipeline + sigma = self.sigma + + item = sample_df.iloc[idx] + keypoint = item['keypoint'] + img_path = item['image_path'] + img = imread(img_path).astype(np.float32) / 255 + img_norm = self.image_transform(img) + + img_aug, kp_aug = imgaug_pipeline(image=img_norm, keypoints=keypoint) + kp_aug = np.array(kp_aug, dtype=np.float32) + target_scoremap = get_gaussian_scoremap( + img_aug.shape[:2], + keypoint=kp_aug, + sigma=sigma) + + data = { + 'input_image': np.moveaxis(img_aug, -1, 0), + 'target_scoremap': np.expand_dims(target_scoremap, axis=0), + 'target_keypoint': kp_aug + } + data_torch = dict() + for key, value in data.items(): + data_torch[key] = torch.from_numpy(value) + return data_torch + + +def get_data_df(data_dir, json_name='labels.json', img_format='jpg'): + rows_dict = dict() + for json_path in Path(data_dir).glob("*/*"+json_name): + dir_path = json_path.parent + this_dir_name = dir_path.name + # read images + this_imgs_dict = dict() + for img_path in dir_path.glob('*.'+img_format): + this_imgs_dict[img_path.stem] = str(img_path.absolute()) + this_labels_dict = json.load(json_path.open('r')) + # key intersection + valid_keys = this_imgs_dict.keys() & this_labels_dict.keys() + for key in valid_keys: + key_tuple = (this_dir_name,) + tuple(key.split('_')) + rows_dict[key_tuple] = { + 'keypoint': tuple(this_labels_dict[key]), + 'image_path': this_imgs_dict[key] + } + data_df = pd.DataFrame( + data=rows_dict.values(), + index=rows_dict.keys()) + data_df.sort_index(inplace=True) + return data_df + + +class KeypointImgaugDataModule(pl.LightningDataModule): + def __init__(self, **kwargs): + super().__init__() + self.kwargs = kwargs + self.sample_dfs = None + + def prepare_data(self): + kwargs = self.kwargs + data_dir = os.path.expanduser( + to_absolute_path(kwargs['data_dir'])) + data_df = get_data_df(data_dir) + + # split + train_split = kwargs.get('train_split', 0.9) + num_train = int(len(data_df) * train_split) + + split_seed = kwargs.get('split_seed', 0) + rs = np.random.RandomState(seed=split_seed) + all_idxs = rs.permutation(len(data_df)) + train_idxs = all_idxs[:num_train] + val_idxs = all_idxs[num_train:] + + train_df = data_df.iloc[train_idxs] + val_df = data_df.iloc[val_idxs] + + sample_dfs = { + 'train': train_df, + 'val': val_df + } + + self.sample_dfs = sample_dfs + + def get_dataset(self, set_name: str, **kwargs): + assert(set_name in ['train', 'val']) + sample_dfs = self.sample_dfs + sample_df = sample_dfs[set_name] + + enable_aug = (set_name == 'train') + dataset_kwargs = copy.deepcopy(self.kwargs) + dataset_kwargs['sample_df'] = sample_df + dataset_kwargs['enable_aug'] = enable_aug + dataset_kwargs.update(**kwargs) + dataset = KeypointImgaugDataset(**dataset_kwargs) + return dataset + + def get_dataloader(self, set_name: str): + assert(set_name in ['train', 'val']) + kwargs = self.kwargs + batch_size = kwargs['batch_size'] + num_workers = kwargs['num_workers'] + + is_train = (set_name == 'train') + dataset = self.get_dataset(set_name) + + dataloader = DataLoader( + dataset, + batch_size=batch_size, + shuffle=is_train, + drop_last=is_train, + num_workers=num_workers) + return dataloader + + def train_dataloader(self): + return self.get_dataloader('train') + + def val_dataloader(self): + return self.get_dataloader('val') diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..85db9b1 --- /dev/null +++ b/environment.yml @@ -0,0 +1,254 @@ +name: irp +channels: + - conda-forge +dependencies: + - 
_libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=1_llvm + - absl-py=1.0.0=pyhd8ed1ab_0 + - aiohttp=3.8.1=py38h0a891b7_1 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - alsa-lib=1.2.3=h516909a_0 + - antlr-python-runtime=4.8=py38h32f6830_2 + - aom=3.3.0=h27087fc_1 + - argh=0.26.2=py38_1001 + - asciitree=0.3.3=py_2 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - attrs=21.4.0=pyhd8ed1ab_0 + - blinker=1.4=py_1 + - blosc=1.21.0=h9c3ff4c_0 + - brotli=1.0.9=h166bdaf_7 + - brotli-bin=1.0.9=h166bdaf_7 + - brotlipy=0.7.0=py38h0a891b7_1004 + - brunsli=0.1=h9c3ff4c_0 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - c-blosc2=2.0.4=h5f21a17_1 + - ca-certificates=2021.10.8=ha878542_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - cairo=1.16.0=h6cf1ce9_1008 + - certifi=2021.10.8=py38h578d9bd_2 + - cffi=1.15.0=py38h3931269_0 + - cfitsio=4.1.0=hd9d235c_0 + - charls=2.3.4=h9c3ff4c_0 + - charset-normalizer=2.0.12=pyhd8ed1ab_0 + - click=8.1.2=py38h578d9bd_0 + - cloudpickle=2.0.0=pyhd8ed1ab_0 + - colorama=0.4.4=pyh9f0ad1d_0 + - configparser=5.2.0=pyhd8ed1ab_0 + - cryptography=36.0.2=py38h2b5fc30_1 + - cudatoolkit=11.2.0=h73cb219_9 + - cycler=0.11.0=pyhd8ed1ab_0 + - cytoolz=0.11.2=py38h0a891b7_2 + - dask-core=2022.4.1=pyhd8ed1ab_0 + - dbus=1.13.6=h5008d03_3 + - docker-pycreds=0.4.0=py_0 + - expat=2.4.8=h27087fc_0 + - fasteners=0.17.3=pyhd8ed1ab_0 + - ffmpeg=4.3.2=h37c90e5_3 + - fontconfig=2.14.0=h8e229c2_0 + - fonttools=4.32.0=py38h0a891b7_0 + - freetype=2.10.4=h0708190_1 + - frozenlist=1.3.0=py38h0a891b7_1 + - fsspec=2022.3.0=pyhd8ed1ab_0 + - future=0.18.2=py38h578d9bd_5 + - geos=3.10.2=h9c3ff4c_0 + - gettext=0.19.8.1=h73d1719_1008 + - giflib=5.2.1=h36c2ea0_2 + - gitdb=4.0.9=pyhd8ed1ab_0 + - gitpython=3.1.27=pyhd8ed1ab_0 + - gmp=6.2.1=h58526e2_0 + - gnutls=3.6.13=h85f3911_1 + - google-auth=2.6.5=pyh6c4a22f_0 + - google-auth-oauthlib=0.4.6=pyhd8ed1ab_0 + - gql=3.2.0=pyhd8ed1ab_0 + - graphite2=1.3.13=h58526e2_1001 + - graphql-core=3.2.1=pyhd8ed1ab_0 + - grpcio=1.45.0=py38ha0cdfde_0 + - gst-plugins-base=1.18.5=hf529b03_3 + - gstreamer=1.18.5=h9f60fe5_3 + - harfbuzz=2.9.1=h83ec7ef_1 + - hdf5=1.10.6=nompi_h6a2412b_1114 + - hydra-core=1.1.1=pyhd8ed1ab_0 + - icu=68.2=h9c3ff4c_0 + - idna=3.3=pyhd8ed1ab_0 + - imagecodecs=2022.2.22=py38h43af60f_4 + - imageio=2.16.2=pyhcf75d05_0 + - imgaug=0.4.0=pyhd8ed1ab_1 + - importlib-metadata=4.11.3=py38h578d9bd_1 + - importlib_resources=5.7.1=pyhd8ed1ab_0 + - jasper=1.900.1=h07fcdf6_1006 + - jbig=2.1=h7f98852_2003 + - jinja2=3.1.1=pyhd8ed1ab_0 + - jpeg=9e=h7f98852_0 + - jxrlib=1.1=h7f98852_2 + - keyutils=1.6.1=h166bdaf_0 + - kiwisolver=1.4.2=py38h43d8883_1 + - krb5=1.19.3=h3790be6_0 + - lame=3.100=h7f98852_1001 + - lcms2=2.12=hddcbb42_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - lerc=3.0=h9c3ff4c_0 + - libaec=1.0.6=h9c3ff4c_0 + - libavif=0.10.1=h166bdaf_0 + - libblas=3.9.0=12_linux64_mkl + - libbrotlicommon=1.0.9=h166bdaf_7 + - libbrotlidec=1.0.9=h166bdaf_7 + - libbrotlienc=1.0.9=h166bdaf_7 + - libcblas=3.9.0=12_linux64_mkl + - libclang=11.1.0=default_ha53f305_1 + - libcurl=7.82.0=h7bff187_0 + - libdeflate=1.10=h7f98852_0 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libevent=2.1.10=h9b69904_4 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=11.2.0=h1d223b6_15 + - libgfortran-ng=11.2.0=h69a702a_15 + - libgfortran5=11.2.0=h5c6108e_15 + - libglib=2.70.2=h174f98d_4 + - libgomp=11.2.0=h1d223b6_15 + - libiconv=1.16=h516909a_0 + - liblapack=3.9.0=12_linux64_mkl + - liblapacke=3.9.0=12_linux64_mkl + - libllvm11=11.1.0=hf817b99_3 + - libnghttp2=1.47.0=h727a467_0 + - libnsl=2.0.0=h7f98852_0 + 
- libogg=1.3.4=h7f98852_1 + - libopencv=4.5.2=py38hd24989e_1 + - libopus=1.3.1=h7f98852_1 + - libpng=1.6.37=h21135ba_2 + - libpq=13.5=hd57d9b9_1 + - libprotobuf=3.16.0=h780b84a_0 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-ng=11.2.0=he4da1e4_15 + - libtiff=4.3.0=h542a066_3 + - libuuid=2.32.1=h7f98852_1000 + - libvorbis=1.3.7=h9c3ff4c_0 + - libwebp=1.2.2=h3452ae3_0 + - libwebp-base=1.2.2=h7f98852_1 + - libxcb=1.13=h7f98852_1004 + - libxkbcommon=1.0.3=he3ba5ed_0 + - libxml2=2.9.12=h72842e0_0 + - libzlib=1.2.11=h166bdaf_1014 + - libzopfli=1.0.3=h9c3ff4c_0 + - llvm-openmp=13.0.1=he0ac6c6_1 + - locket=0.2.0=py_2 + - lz4-c=1.9.3=h9c3ff4c_1 + - markdown=3.3.6=pyhd8ed1ab_0 + - markupsafe=2.1.1=py38h0a891b7_1 + - matplotlib=3.5.1=py38h578d9bd_0 + - matplotlib-base=3.5.1=py38hf4fb855_0 + - mkl=2021.4.0=h8d4b97c_729 + - msgpack-python=1.0.3=py38h43d8883_1 + - multidict=6.0.2=py38h0a891b7_1 + - munkres=1.1.4=pyh9f0ad1d_0 + - mysql-common=8.0.28=haf5c9bc_3 + - mysql-libs=8.0.28=h28c427c_3 + - ncurses=6.3=h27087fc_1 + - nettle=3.6=he412f7d_0 + - networkx=2.7.1=pyhd8ed1ab_0 + - ninja=1.10.2=h4bd325d_1 + - nspr=4.32=h9c3ff4c_1 + - nss=3.77=h2350873_0 + - numcodecs=0.9.1=py38h709712a_2 + - numpy=1.21.1=py38h9894fe3_0 + - nvidia-ml=7.352.0=py_0 + - oauthlib=3.2.0=pyhd8ed1ab_0 + - omegaconf=2.1.1=py38h578d9bd_1 + - opencv=4.5.2=py38h578d9bd_1 + - openh264=2.1.1=h780b84a_0 + - openjpeg=2.4.0=hb52868f_1 + - openssl=1.1.1n=h166bdaf_0 + - packaging=21.3=pyhd8ed1ab_0 + - pandas=1.4.2=py38h47df419_1 + - partd=1.2.0=pyhd8ed1ab_0 + - pathtools=0.1.2=py_1 + - pcre=8.45=h9c3ff4c_0 + - pillow=9.1.0=py38h0ee0e06_2 + - pip=22.0.4=pyhd8ed1ab_0 + - pixman=0.40.0=h36c2ea0_0 + - promise=2.3=py38h578d9bd_6 + - protobuf=3.16.0=py38h709712a_0 + - psutil=5.9.0=py38h0a891b7_1 + - pthread-stubs=0.4=h36c2ea0_1001 + - py-opencv=4.5.2=py38hd0cf306_1 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pydeprecate=0.3.1=pyhd8ed1ab_0 + - pyjwt=2.3.0=pyhd8ed1ab_1 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.8=pyhd8ed1ab_0 + - pyqt=5.12.3=py38h578d9bd_8 + - pyqt-impl=5.12.3=py38h0ffb2e6_8 + - pyqt5-sip=4.19.18=py38h709712a_8 + - pyqtchart=5.12=py38h7400c14_8 + - pyqtwebengine=5.12.1=py38h7400c14_8 + - pysocks=1.7.1=py38h578d9bd_5 + - python=3.8.13=h582c2e5_0_cpython + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python_abi=3.8=2_cp38 + - pytorch=1.9.0=cpu_py38h4bbe6ce_2 + - pytorch-cpu=1.9.0=cpu_py38h718b53a_2 + - pytorch-lightning=1.4.4=pyhd8ed1ab_0 + - pytz=2022.1=pyhd8ed1ab_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pywavelets=1.3.0=py38h71d37f0_1 + - pyyaml=6.0=py38h0a891b7_4 + - qt=5.12.9=hda022c4_4 + - readline=8.1=h46c0cb4_0 + - requests=2.27.1=pyhd8ed1ab_0 + - requests-oauthlib=1.3.1=pyhd8ed1ab_0 + - rsa=4.8=pyhd8ed1ab_0 + - scikit-image=0.19.2=py38h43a58ef_0 + - scikit-video=1.1.11=pyh24bf2e0_0 + - scipy=1.7.1=py38h56a6a73_0 + - sentry-sdk=1.5.10=pyhd8ed1ab_0 + - setuptools=62.1.0=py38h578d9bd_0 + - shapely=1.8.0=py38h97f7145_6 + - shortuuid=1.0.8=py38h578d9bd_0 + - six=1.16.0=pyh6c4a22f_0 + - sleef=3.5.1=h9b69904_2 + - smmap=3.0.5=pyh44b312d_0 + - snappy=1.1.8=he1b5a44_3 + - sqlite=3.38.2=h4ff8645_0 + - subprocess32=3.5.4=py_1 + - tbb=2021.5.0=h924138e_1 + - tensorboard=2.8.0=pyhd8ed1ab_1 + - tensorboard-data-server=0.6.0=py38h2b5fc30_2 + - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0 + - tifffile=2022.4.8=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - toolz=0.11.2=pyhd8ed1ab_0 + - torchmetrics=0.8.0=pyhd8ed1ab_0 + - torchvision=0.10.1=py38h9e2e28c_0_cpu + - tornado=6.1=py38h0a891b7_3 + - 
tqdm=4.64.0=pyhd8ed1ab_0 + - typing-extensions=4.1.1=hd8ed1ab_0 + - typing_extensions=4.1.1=pyha770c72_0 + - unicodedata2=14.0.0=py38h0a891b7_1 + - urllib3=1.26.9=pyhd8ed1ab_0 + - wandb=0.11.2=pyhd8ed1ab_0 + - watchdog=0.10.4=py38h578d9bd_0 + - werkzeug=2.1.1=pyhd8ed1ab_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - x264=1!161.3030=h7f98852_1 + - xorg-kbproto=1.0.7=h7f98852_1002 + - xorg-libice=1.0.10=h7f98852_0 + - xorg-libsm=1.2.3=hd9c2040_1000 + - xorg-libx11=1.7.2=h7f98852_0 + - xorg-libxau=1.0.9=h7f98852_0 + - xorg-libxdmcp=1.1.3=h7f98852_0 + - xorg-libxext=1.3.4=h7f98852_1 + - xorg-libxrender=0.9.10=h7f98852_1003 + - xorg-renderproto=0.11.1=h7f98852_1002 + - xorg-xextproto=7.3.0=h7f98852_1002 + - xorg-xproto=7.0.31=h7f98852_1007 + - xz=5.2.5=h516909a_1 + - yaml=0.2.5=h7f98852_2 + - yarl=1.7.2=py38h0a891b7_2 + - zarr=2.11.3=pyhd8ed1ab_0 + - zfp=0.5.5=h9c3ff4c_8 + - zipp=3.8.0=pyhd8ed1ab_0 + - zlib=1.2.11=h166bdaf_1014 + - zstd=1.5.2=ha95c52a_0 diff --git a/environments/dataset_environment.py b/environments/dataset_environment.py new file mode 100644 index 0000000..3e04cd9 --- /dev/null +++ b/environments/dataset_environment.py @@ -0,0 +1,96 @@ +from typing import Tuple, Optional + +import numpy as np +from hydra.utils import to_absolute_path + +from common.sample_util import GridCoordTransformer, get_nd_index_volume +from common.zarr_util import open_cached + +class DatasetEnvironment: + def __init__(self, + zarr_path: str, + name: str, + rope_id: Tuple[int, int], + transformer: GridCoordTransformer, + random_init: bool=False, + seed: Optional[bool]=None, + cache_size: str='10GB', + raw_data_name: str=None): + rope_id = tuple(rope_id) + + zarr_root = open_cached( + to_absolute_path(zarr_path), 'r', + cache_size=cache_size) + data_array = zarr_root[name] + raw_data_array = None + if raw_data_name is not None: + raw_data_array = zarr_root[raw_data_name] + nn_valid_action = zarr_root['fill_invalid/nn_valid_action'][rope_id] + nn_dist = zarr_root['fill_invalid/nn_dist'][rope_id] + + self.data_array = data_array + self.raw_data_array = raw_data_array + self.nn_valid_action = nn_valid_action + self.nn_dist = nn_dist + self.rope_id = rope_id + self.goal_pix = None + self.random_init = random_init + self.rs = np.random.RandomState(seed=seed) + self.transformer = transformer + + def set_goal(self, goal: Tuple[float, float]): + self.goal_pix = tuple(self.transformer.to_grid([goal], clip=True)[0]) + + def set_goal_pix(self, goal_pix: Tuple[int, int]): + self.goal_pix = tuple(goal_pix) + + def step(self, action: np.ndarray + ) -> Tuple[np.ndarray, float, bool, dict]: + if self.goal_pix is None: + raise RuntimeError('Please call set_goal before step.') + + eps = 1e-7 + action = np.clip(action, 0, 1-eps) + action_scale = np.array( + self.nn_dist.shape, + dtype=action.dtype) + raw_action_coord = tuple((action * action_scale + ).astype(np.int64).tolist()) + # avoid potential invalid cell + action_nn_dist = self.nn_dist[raw_action_coord] + action_coord = tuple(self.nn_valid_action[raw_action_coord].tolist()) + + n_inits = self.data_array.shape[-3] + init_id = n_inits // 2 + if self.random_init: + init_id = self.rs.choice(init_id, size=1)[0] + + coord = self.rope_id + action_coord + (init_id,) + traj_img = self.data_array[coord] + + # compute error + img_coords = get_nd_index_volume(traj_img.shape) + traj_coords = img_coords[traj_img] + dists_pix = np.linalg.norm(traj_coords - self.goal_pix, axis=-1) + dist_pix = np.min(dists_pix) + dist_m = dist_pix / self.transformer.pix_per_m + + observation = traj_img + loss 
= dist_m + done = False + info = { + 'action': np.array(action_coord) / action_scale, + 'action_coord': action_coord, + 'action_nn_dist': action_nn_dist, + 'init_id': init_id + } + # add trajectory + if self.raw_data_array is not None: + data = self.raw_data_array[coord] + has_data = np.all(np.isfinite(data),axis=-1) + trajectory = data[has_data].astype(np.float32) + trajectory_pix = self.transformer.to_grid(trajectory[:,[0,2]]) + info['trajectory'] = trajectory + info['trajectory_pix'] = trajectory_pix + + return observation, loss, done, info diff --git a/environments/goal_selection.py b/environments/goal_selection.py new file mode 100644 index 0000000..edb7e00 --- /dev/null +++ b/environments/goal_selection.py @@ -0,0 +1,39 @@ +import numpy as np +import zarr +from hydra.utils import to_absolute_path +from common.sample_util import get_nd_index_volume + +def select_rope_and_goals( + zarr_path, n_ropes, n_goals, + mask_names=('split/is_test',), + seed=0): + root = zarr.open(to_absolute_path(zarr_path), 'r') + assert(len(mask_names) > 0) + mask = None + for name in mask_names: + this_mask = root[name][:] + if mask is None: + mask = this_mask + else: + mask = mask & this_mask + + rope_idx_volume = get_nd_index_volume(mask.shape) + test_rope_idxs = rope_idx_volume[mask] + + rs = np.random.RandomState(seed=seed) + rope_ids = test_rope_idxs[ + rs.choice(len(test_rope_idxs), + size=n_ropes, replace=False)] + # select goals + max_hitrate_array = root['control/max_hitrate'] + img_shape = max_hitrate_array.shape[-2:] + goal_coord_img = get_nd_index_volume(img_shape) + rope_goal_dict = dict() + for i in range(len(rope_ids)): + rope_id = rope_ids[i] + this_hitrate_img = max_hitrate_array[tuple(rope_id)] + valid_goal_mask = this_hitrate_img > 0.95 + valid_goals = goal_coord_img[valid_goal_mask] + rope_goal_dict[tuple(rope_id.tolist())] = valid_goals[ + rs.choice(len(valid_goals), n_goals, replace=False)] + return rope_goal_dict diff --git a/environments/sim_environment.py b/environments/sim_environment.py new file mode 100644 index 0000000..1d5d878 --- /dev/null +++ b/environments/sim_environment.py @@ -0,0 +1,175 @@ +from typing import Tuple, Optional + +import numpy as np +from scipy.interpolate import interp1d +from omegaconf import DictConfig, OmegaConf +from hydra.utils import to_absolute_path + +from abr_control.arms.mujoco_config import MujocoConfig +from abr_control.controllers import Joint +from abr_control.interfaces.mujoco import Mujoco + +from abr_control_mod.mujoco_utils import ( + get_rope_body_ids, get_body_center_of_mass, + apply_impulse_com_batch, get_mujoco_state, set_mujoco_state) +from common.urscript_control_util import get_movej_trajectory +from common.template_util import require_xml +from common.cv_util import get_traj_occupancy +from common.sample_util import (GridCoordTransformer, get_nd_index_volume) + + +def deg_to_rad(deg): + return deg / 180 * np.pi + +def get_param_dict(length, density, + width=0.02, stick_length=0.48, + num_nodes=25, **kwargs): + stiffness = 0.01 / 0.015 * density + damping = 0.005 / 0.015 * density + + link_size = length / (num_nodes - 1) / 2 + param_dict = { + 'count': num_nodes, + 'spacing': link_size * 2, + 'link_size': link_size, + 'link_width': width / 2, + 'link_mass': density * length / num_nodes, + 'stiffness': stiffness, + 'damping': damping, + 'stick_size': (stick_length - link_size) / 2, + 'ee_offset': stick_length + } + return param_dict + +class SimEnvironment: + def __init__(self, env_cfg: DictConfig, rope_cfg: DictConfig): + 
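# Constructor overview: build the grid transformer used to rasterize rope trajectories, + # render the rope MuJoCo XML from the jinja2 template with the rope parameters, + # connect the abr_control Mujoco interface with a joint-space controller, and + # snapshot the initial joint state so every step() can restart from the same pose. +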
self.env_cfg = env_cfg + + # create transformer + transformer = GridCoordTransformer(**env_cfg.transformer) + + # build simulation xml + xml_dir = to_absolute_path(env_cfg.xml_dir) + rope_param_dict = get_param_dict(**rope_cfg) + xml_fname = require_xml( + xml_dir, rope_param_dict, + to_absolute_path(env_cfg.template_path), force=True) + + # load mujoco environment + robot_config = MujocoConfig( + xml_file=xml_fname, + folder=xml_dir) + interface = Mujoco(robot_config, + dt=env_cfg.sim.dt, + visualize=env_cfg.sim.visualize) + interface.connect() + ctrlr = Joint(robot_config, kp=env_cfg.sim.kp) + + j_init = deg_to_rad(np.array(env_cfg.sim.j_init_deg)) + interface.set_joint_state(q=j_init, dq=np.zeros(6)) + init_state = get_mujoco_state(interface.sim) + rope_body_ids = get_rope_body_ids(interface.sim.model) + + self.transformer = transformer + self.interface = interface + self.ctrlr = ctrlr + self.init_state = init_state + self.rope_body_ids = rope_body_ids + self.rs = np.random.RandomState(env_cfg.seed) + + def set_goal(self, goal: Tuple[float, float]): + self.goal_pix = tuple(self.transformer.to_grid([goal], clip=True)[0]) + + def set_goal_pix(self, goal_pix: Tuple[int, int]): + self.goal_pix = tuple(goal_pix) + + def step(self, action: np.ndarray + ) -> Tuple[np.ndarray, float, bool, dict]: + interface = self.interface + rope_body_ids = self.rope_body_ids + ctrlr = self.ctrlr + init_state = self.init_state + + if self.goal_pix is None: + raise RuntimeError('Please call set_goal before step.') + + eps = 1e-7 + action = np.clip(action, 0, 1-eps) + # compute action + ac = self.env_cfg.action + speed_interp = interp1d([0,1], ac.speed_range) + j2_interp = interp1d([0,1], ac.j2_delta_range) + j3_interp = interp1d([0,1], ac.j3_delta_range) + speed = speed_interp(action[0]) + j2_delta = j2_interp(action[1]) + j3_delta = j3_interp(action[2]) + impulse = 0 + if self.env_cfg.random_init: + impulse = self.rs.uniform(*ac.impulse_range) + + # generate target + sc = self.env_cfg.sim + j_init = deg_to_rad(np.array(sc.j_init_deg)) + j_start = j_init + j_end = j_init.copy() + j_end[2] += j2_delta + j_end[3] += j3_delta + + q_target = get_movej_trajectory( + j_start=j_start, j_end=j_end, + acceleration=ac.acceleration, speed=speed, dt=sc.dt) + qdot_target = np.gradient(q_target, sc.dt, axis=0) + + # run simulation + set_mujoco_state(interface.sim, init_state) + + impulses = np.multiply.outer( + np.linspace(0, 1, len(rope_body_ids)), + np.array([impulse, 0, 0])) + apply_impulse_com_batch( + sim=interface.sim, + body_ids=rope_body_ids, + impulses=impulses) + + num_sim_steps = int(sc.sim_duration / sc.dt) + rope_history = list() + n_contact_buffer = [0] + for i in range(num_sim_steps): + feedback = interface.get_feedback() + + idx = min(i, len(q_target)-1) + u = ctrlr.generate( + q=feedback['q'], + dq=feedback['dq'], + target=q_target[idx], + target_velocity=qdot_target[idx] + ) + n_contact = interface.sim.data.ncon + n_contact_buffer.append(n_contact) + if i % sc.subsample_rate == 0: + nc = max(n_contact_buffer) + if nc > 0: + break + rope_body_com = get_body_center_of_mass( + interface.sim.data, rope_body_ids) + rope_history.append(rope_body_com[-1]) + n_contact_buffer = [0] + interface.send_forces(u) + + this_data = np.array(rope_history, dtype=np.float32) + traj_img = get_traj_occupancy(this_data[:,[0,2]], self.transformer) + + img_coords = get_nd_index_volume(traj_img.shape) + traj_coords = img_coords[traj_img] + dists_pix = np.linalg.norm(traj_coords - self.goal_pix, axis=-1) + dist_pix = 
np.min(dists_pix) + dist_m = dist_pix / self.transformer.pix_per_m + + # return + observation = traj_img + loss = dist_m + done = False + info = { + 'action': action + } + return observation, loss, done, info diff --git a/environments/ur5_whip_environment.py b/environments/ur5_whip_environment.py new file mode 100644 index 0000000..b96333e --- /dev/null +++ b/environments/ur5_whip_environment.py @@ -0,0 +1,148 @@ +from typing import Dict, Tuple, Optional +import time +from threading import Thread +import pickle +from hydra.utils import to_absolute_path + +import torch +import numpy as np +from omegaconf import DictConfig, OmegaConf +from matplotlib import cm +import cv2 as cv +from scipy.interpolate import interp1d + +from real_ur5.zed_camera import ZedCamera +from real_ur5.swing_actor import SwingActor +from real_ur5.zed_ram_recorder import ZedRamRecorder +from real_ur5.trajectory_projector import ( + TrajectoryProjector, GridCoordTransformer) +from networks.keypoint_deeplab import KeypointDeeplab +from components.tracking import KeypointTracker +from real_ur5.buffered_tracker import BufferedTracker +from real_ur5.delta_action_sampler import get_distance + + +class Ur5WhipEnvironment: + def __init__(self, env_cfg: DictConfig, camera: ZedCamera): + self.env_cfg = env_cfg + # init action + self.actor = SwingActor(**env_cfg.action.swing_actor) + # init tracking + vc = env_cfg.vision + self.transformer = GridCoordTransformer.from_zarr( + to_absolute_path(vc.transformer.zarr_path)) + + cal_data = pickle.load(open( + to_absolute_path(vc.projector.calib_path), 'rb')) + tx_img_robot = cal_data['tx_img_robot'] + self.projector = TrajectoryProjector( + tx_img_robot=tx_img_robot, + transformer=self.transformer, + **vc.projector) + + tc = vc.tracker + device = torch.device('cuda', tc.gpu_id) + model = KeypointDeeplab.load_from_checkpoint( + to_absolute_path(tc.ckpt_path)) + tracker = KeypointTracker(model) + tracker.to(device=device, dtype=torch.float16) + self.tracker = tracker + self.device = device + + self.camera = camera + self.goal_pix = None + + def __enter__(self): + self.actor.__enter__() + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.actor.__exit__(exc_type, exc_value, exc_traceback) + + def set_goal(self, goal: Tuple[float, float]): + self.goal_pix = tuple(self.transformer.to_grid([goal], clip=True)[0]) + + def set_goal_pix(self, goal_pix: Tuple[int, int]): + self.goal_pix = tuple(goal_pix) + + def reset(self): + self.actor.reset(blocking=True) + time.sleep(0.5) + + def step(self, action: np.ndarray + ) -> Tuple[np.ndarray, float, bool, dict]: + if self.goal_pix is None: + raise RuntimeError('Please call set_goal before step.') + + # prep + eps = 1e-7 + action = np.clip(action, 0, 1-eps) + vc = self.env_cfg.vision + recorder = ZedRamRecorder(self.camera, + **vc.ram_recorder) + buff_tracker = BufferedTracker( + recorder=recorder, + tracker=self.tracker, + **vc.buff_tracker) + + # wait + if 'pre_wait' in self.env_cfg.action: + time.sleep(self.env_cfg.action.pre_wait) + + # action + ac = self.env_cfg.action + buff_tracker.start() + self.actor.swing(*action.tolist(), blocking=True) + time.sleep(ac.action_duration) + buff_tracker.stop() + + # async reset + def reset_func(actor, action, duration, speed): + actor.swing(*action, blocking=True) + time.sleep(duration) + actor.reset(speed=speed, blocking=True) + reset_t = Thread(target=reset_func, + args=(self.actor,), + kwargs=ac.reset) + reset_t.start() + + # get tracking + tracking_result = 
buff_tracker.get_tracking() + traj_img = self.projector.get_sim_traj_img(tracking_result) + + # compute loss + dist_to_goal_pix = get_distance(traj_img, self.goal_pix)[0] + pix_per_m = self.transformer.pix_per_m + dist_to_goal_m = dist_to_goal_pix / pix_per_m + + # compute trajectory + keypoints = tracking_result['keypoints'][:,0,:] + confidence = tracking_result['confidence'][:,0] + mask = confidence > self.projector.confidence_threshold + # if mask.sum() < 10: + # raise RuntimeError("Invalid tracking") + raw_x = np.linspace(0,1,len(keypoints)) + valid_keypoints = keypoints[mask] + valid_x = raw_x[mask] + + def nearest_interp1d(x, y, **kwargs): + return interp1d(x, y, bounds_error=False, fill_value=(y[0],y[-1]),**kwargs) + raw_traj = nearest_interp1d(valid_x, valid_keypoints, axis=0)(raw_x) + robot_traj = self.projector.to_robot_frame(raw_traj) + if self.projector.flip_x: + robot_traj[:,0] *= -1 + pix_traj = self.projector.transformer.to_grid(robot_traj) + + observation = traj_img + loss = dist_to_goal_m + done = False + info = { + 'goal_pix': self.goal_pix, + 'action': action, + 'images': np.concatenate(buff_tracker.imgs_list, axis=0), + 'tracking_result': tracking_result, + 'trajectory': robot_traj, + 'trajectory_pix': pix_traj + } + torch.cuda.synchronize(self.device) + return observation, loss, done, info diff --git a/eval_irp_pred_accuracy.py b/eval_irp_pred_accuracy.py new file mode 100644 index 0000000..8a96915 --- /dev/null +++ b/eval_irp_pred_accuracy.py @@ -0,0 +1,87 @@ +# %% +# import +import os +import pathlib +import collections +import json + +import numpy as np +from scipy.spatial.ckdtree import cKDTree + +import yaml +import hydra +from hydra.utils import to_absolute_path +from omegaconf import DictConfig, OmegaConf +import pytorch_lightning as pl +import torch +import wandb +from tqdm import tqdm + +from networks.delta_trajectory_deeplab import DeltaTrajectoryDeeplab +from datasets.delta_trajectory_gaussian_dataset import DeltaTrajectoryGaussianDataModule +from common.sample_util import GridCoordTransformer + +# %% +@hydra.main(config_path="config", config_name=pathlib.Path(__file__).stem) +def main(cfg: DictConfig) -> None: + if cfg.wandb.project is None: + cfg.wandb.project = os.path.basename(__file__) + wandb.init(**cfg.wandb) + config = OmegaConf.to_container(cfg, resolve=True) + output_dir = os.getcwd() + config['output_dir'] = output_dir + yaml.dump(config, open('config.yaml', 'w'), default_flow_style=False) + wandb.config.update(config) + + datamodule = DeltaTrajectoryGaussianDataModule(**cfg.datamodule) + datamodule.prepare_data() + test_dataloader = datamodule.test_dataloader() + transformer = GridCoordTransformer.from_zarr( + to_absolute_path(cfg.datamodule.zarr_path)) + + device = torch.device('cuda', cfg.model.gpu_id) + model = DeltaTrajectoryDeeplab.load_from_checkpoint( + to_absolute_path(cfg.model.ckpt_path)) + model = model.eval().to(device) + + coord_img = np.moveaxis(np.indices((256,256)),0,-1) + coord_img = transformer.from_grid(coord_img) + raw_metrics_dict = collections.defaultdict(list) + with torch.no_grad(): + for batch in tqdm(test_dataloader): + input_trajectory = batch['input_trajectory'].to(device) + action_delta = batch['action_delta'].to(device) + logits = model.forward(input_trajectory, action_delta) + probs = torch.sigmoid(logits) + + gt_imgs = batch['target_trajectory'].numpy() + pred_imgs = probs.detach().to('cpu').numpy() + + for i in range(gt_imgs.shape[0]): + gt_points = coord_img[gt_imgs[i,0] > 0.5] + pred_points = 
coord_img[pred_imgs[i,0] > cfg.model.threshold] + gt_tree = cKDTree(gt_points) + pred_tree = cKDTree(pred_points) + + gt_dists, _ = pred_tree.query(gt_points) + pred_dists, _ = gt_tree.query(pred_points) + + metric = { + 'gt_chamfer': np.mean(gt_dists), + 'pred_chamfer': np.mean(pred_dists) + } + for key, value in metric.items(): + raw_metrics_dict[key].append(value) + wandb.log(metric) + + # aggregate + agg_metrics_dict = dict() + for key, value in raw_metrics_dict.items(): + agg_metrics_dict[key] = np.mean(value) + + # save data + json.dump(agg_metrics_dict, open('metrics.json','w'), indent=4) + +# %% +if __name__ == '__main__': + main() diff --git a/eval_irp_rope_dataset.py b/eval_irp_rope_dataset.py new file mode 100644 index 0000000..72f48ab --- /dev/null +++ b/eval_irp_rope_dataset.py @@ -0,0 +1,134 @@ +# %% +# import +import os +import pathlib +import yaml +import hydra +from hydra.utils import to_absolute_path +from omegaconf import DictConfig, OmegaConf +import zarr +import numpy as np +from tqdm import tqdm +import torch +import wandb +from scipy.interpolate import interp1d + +from common.sample_util import GridCoordTransformer, transpose_data_dict +from environments.dataset_environment import DatasetEnvironment +from environments.goal_selection import select_rope_and_goals +from common.wandb_util import get_error_plots_log + + +from networks.delta_trajectory_deeplab import DeltaTrajectoryDeeplab +from real_ur5.delta_action_sampler import DeltaActionGaussianSampler +from real_ur5.delta_action_selector import DeltaActionSelector + + +# %% +@hydra.main(config_path="config", config_name=pathlib.Path(__file__).stem) +def main(cfg: DictConfig) -> None: + if not cfg.offline: + wandb.init(**cfg.wandb) + abs_zarr_path = to_absolute_path(cfg.setup.zarr_path) + rope_goal_dict = select_rope_and_goals( + zarr_path=abs_zarr_path, + **cfg.setup.selection) + config = OmegaConf.to_container(cfg, resolve=True) + output_dir = os.getcwd() + config['output_dir'] = output_dir + yaml.dump(config, open('config.yaml', 'w'), default_flow_style=False) + if not cfg.offline: + wandb.config.update(config) + + root = zarr.open(abs_zarr_path, 'r') + init_action_array = root['train_rope/best_action_coord'] + action_scale = np.array(root[cfg.setup.name].shape[2:5]) + + # load action model + device = torch.device('cuda', cfg.action.gpu_id) + dtype = torch.float16 if cfg.action.use_fp16 else torch.float32 + sampler = DeltaActionGaussianSampler(**cfg.action.sampler) + action_model = DeltaTrajectoryDeeplab.load_from_checkpoint( + to_absolute_path(cfg.action.ckpt_path)) + action_model_gpu = action_model.to( + device, dtype=dtype).eval() + selector = DeltaActionSelector( + model=action_model_gpu, **cfg.action.selector) + + transformer = GridCoordTransformer.from_zarr(abs_zarr_path) + # load action model + ropes_log = list() + for rope_id, goals in rope_goal_dict.items(): + env = DatasetEnvironment( + zarr_path=abs_zarr_path, + name=cfg.setup.name, + rope_id=rope_id, + transformer=transformer, + random_init=cfg.setup.random_init) + goals_log = list() + for goal_pix in tqdm(goals): + env.set_goal_pix(goal_pix) + # experiment + init_action = init_action_array[tuple(goal_pix)] / action_scale + action = init_action + # action = np.random.uniform(0,1,3) + steps_log = list() + # obs_log = list() + for step_id in range(cfg.setup.n_steps): + observation, dist_to_goal_m, _, info = env.step( + action) + # action = info['action'] + sigma = cfg.action.constant_sigma + if sigma is None: + sigma = min(dist_to_goal_m * 
cfg.action.gain, cfg.action.sigma_max) + + ts = cfg.action.threshold + threshold_interp = interp1d( + x=[0, ts.dist_max], y=[ts.max, ts.min], + kind='linear', + bounds_error=False, + fill_value=(ts.max, ts.min)) + threshold = threshold_interp(min(dist_to_goal_m, ts.dist_max)) + + delta_action_samples = sampler.get_delta_action_samples( + action, sigma=sigma) + selection_result = selector.get_delta_action( + traj_img=observation, goal_pix=goal_pix, + delta_action_samples=delta_action_samples, + threshold=threshold) + best_delta_action = selection_result['best_delta_action'] + + # logging + steps_log.append({ + 'error': dist_to_goal_m, + 'action': action + }) + # next + action = best_delta_action + action + # action = np.random.uniform(0,1,3) + # obs_log.append(observation) + + # aggregate + steps_log = transpose_data_dict(steps_log) + goals_log.append(steps_log) + + # aggregate data + rope_log = transpose_data_dict(goals_log) + rope_key = 'rope_' + '_'.join(str(x) for x in rope_id) + log = get_error_plots_log(rope_key, rope_log['error']) + if not cfg.offline: + wandb.log(log) + ropes_log.append(rope_log) + all_logs = transpose_data_dict(ropes_log) + errors = all_logs['error'].reshape(-1, all_logs['error'].shape[-1]) + log = get_error_plots_log('all', errors) + if not cfg.offline: + wandb.log(log) + import pickle + pickle.dump(all_logs, open('log.pkl', 'wb')) + +# %% +if __name__ == '__main__': + main() + +# %% diff --git a/eval_irp_rope_real.py b/eval_irp_rope_real.py new file mode 100644 index 0000000..be9ac91 --- /dev/null +++ b/eval_irp_rope_real.py @@ -0,0 +1,234 @@ +# %% +# import +import os +import pathlib +import yaml +import hydra +from hydra.utils import to_absolute_path +from omegaconf import DictConfig, OmegaConf +import zarr +import numpy as np +from tqdm import tqdm +import torch +import wandb +from scipy.interpolate import interp1d +import cv2 as cv +from matplotlib import cm +import pickle +import skvideo.io +from threading import Thread +import shutil + +from real_ur5.zed_camera import ZedCamera +from environments.ur5_whip_environment import Ur5WhipEnvironment +from common.sample_util import transpose_data_dict +from common.wandb_util import get_error_plots_log + +from networks.delta_trajectory_deeplab import DeltaTrajectoryDeeplab +from real_ur5.delta_action_sampler import DeltaActionGaussianSampler +from real_ur5.delta_action_selector import DeltaActionSelector + +# %% +def vis_tracking_goal(images, keypoints, confidence, goal_pix, + projector, threshold=0.3): + vis_img = images.min(axis=0)[:,:,[2,1,0]].copy() + mask = confidence > threshold + valid_keypoints = keypoints[mask] + invalid_keypoints = keypoints[~mask] + for kp in invalid_keypoints: + cv.drawMarker(vis_img, tuple(kp), (0,0,255), + markerType=cv.MARKER_TILTED_CROSS, markerSize=10, thickness=1) + cv.polylines(vis_img, [valid_keypoints], False, color=(0,255,0)) + goal_kp = projector.grid_to_image([goal_pix])[0].astype(np.int32) + cv.drawMarker(vis_img, goal_kp, + color=(255,0,0), markerType=cv.MARKER_CROSS, + markerSize=50, thickness=3) + return vis_img + + +def vis_action(best_prob, best_mask, observation, goal_pix): + def draw(input, observation, goal_pix): + cmap = cm.get_cmap('viridis') + left_img = cmap(input)[:,:,:3].copy() + if observation is not None: + left_img[observation] = (np.array([1,0,0]) + left_img[observation]) / 2 + cv.drawMarker(left_img, goal_pix[::-1], + color=(1,0,0), markerType=cv.MARKER_CROSS, + markerSize=20, thickness=1) + return np.moveaxis(left_img,0,1)[::-1,::-1] + + left_img = 
draw(best_prob, None, goal_pix) + right_img = draw(best_mask.astype(np.float32), observation, goal_pix) + img = np.concatenate([left_img, right_img], axis=1) + return img + + +def save_video(out_fname, images): + pathlib.Path(out_fname).resolve().parent.mkdir(parents=True, exist_ok=True) + writer = skvideo.io.FFmpegWriter( + out_fname, + inputdict={ + '-r': '60' + }, + outputdict={ + '-r': '60', + '-c:v': 'libx264', + '-crf': '18', + '-preset': 'medium', + '-pix_fmt': 'yuv420p' + # '-profile:v': 'high' + }) + for img in images: + writer.writeFrame(img[:,:,[2,1,0]]) + writer.close() + + +# %% +@hydra.main(config_path="config", config_name=pathlib.Path(__file__).stem) +def main(cfg: DictConfig) -> None: + output_dir= os.getcwd() + print(output_dir) + if not cfg.offline: + wandb.init(**cfg.wandb) + config = OmegaConf.to_container(cfg, resolve=True) + config['output_dir'] = output_dir + yaml.dump(config, open('config.yaml', 'w'), default_flow_style=False) + if not cfg.offline: + wandb.config.update(config) + shutil.copyfile( + to_absolute_path(cfg.env.vision.projector.calib_path), + 'calibration.pkl') + + # load setup + goal_pixs = list(cfg.setup.goal_pixs) + + # load enviroment + with ZedCamera() as camera: + env = Ur5WhipEnvironment(env_cfg=cfg.env, camera=camera) + # load action model + device = torch.device('cuda', cfg.action.gpu_id) + dtype = torch.float16 if cfg.action.use_fp16 else torch.float32 + sampler = DeltaActionGaussianSampler(**cfg.action.sampler) + action_model = DeltaTrajectoryDeeplab.load_from_checkpoint( + to_absolute_path(cfg.action.ckpt_path)) + action_model_gpu = action_model.to( + device, dtype=dtype).eval() + selector = DeltaActionSelector( + model=action_model_gpu, **cfg.action.selector) + + # load init action + ic = cfg.action.init_action + root = zarr.open(to_absolute_path(ic.zarr_path), 'r') + init_action_arr = root[ic.name][:] / ic.action_scale + video_threads = list() + with env: + env.reset() + goals_log = list() + for goal_pix in tqdm(goal_pixs): + env.set_goal_pix(goal_pix) + # experiment + init_action = init_action_arr[tuple(goal_pix)] + const_action = ic.get('const_action', None) + if const_action is not None: + init_action = np.array(const_action) + action = init_action + steps_log = list() + # obs_log = list() + min_dist = float('inf') + for step_id in tqdm(range(cfg.setup.n_steps)): + print('action', action) + observation, dist_to_goal_m, _, info = env.step( + action) + + # log video + if cfg.setup.save_video: + fname = 'videos/' + '_'.join(str(x) for x in goal_pix) + '_' + str(step_id) + '.mp4' + vt = Thread(target=save_video, args=(fname, info['images'])) + vt.start() + video_threads.append(vt) + + min_dist = min(min_dist, dist_to_goal_m) + # action = info['action'] + sigma = cfg.action.constant_sigma + if sigma is None: + sigma = min(dist_to_goal_m * cfg.action.gain, cfg.action.sigma_max) + + ts = cfg.action.threshold + threshold_interp = interp1d( + x=[0, ts.dist_max], y=[ts.max, ts.min], + kind='linear', + bounds_error=False, + fill_value=(ts.max, ts.min)) + threshold = threshold_interp(min(dist_to_goal_m, ts.dist_max)) + + delta_action_samples = sampler.get_delta_action_samples( + action, sigma=sigma) + selection_result = selector.get_delta_action( + traj_img=observation, goal_pix=goal_pix, + delta_action_samples=delta_action_samples, + threshold=threshold) + best_delta_action = selection_result['best_delta_action'] + + # logging + # vis trajectory and goal + images = info['images'] + tracking_result = info['tracking_result'] + projector = 
env.projector + tracking_vis = vis_tracking_goal( + images, tracking_result['keypoints'], + confidence=tracking_result['confidence'], + projector=projector, + goal_pix=goal_pix, + threshold=cfg.env.vision.projector.confidence_threshold + ) + # vis action + best_prob = selection_result['best_prob'] + best_mask = selection_result['best_mask'] + action_vis = vis_action(best_prob, best_mask, observation, goal_pix) + + log = { + 'tracking_vis': tracking_vis, + 'action_vis': action_vis, + 'error': dist_to_goal_m, + 'k_min_error': min_dist, + 'step_id': step_id + } + other_log = { + 'action': action, + 'observation': observation, + 'keypoints': tracking_result['keypoints'], + 'confidence': tracking_result['confidence'], + 'goal_pix': np.array(goal_pix), + 'sigma': sigma, + 'threshold': threshold + } + if not cfg.offline: + wandb_log = dict() + this_key = 'goal_'+'_'.join(str(x) for x in goal_pix) + for key, value in log.items(): + if key.endswith('vis'): + value = wandb.Image(value) + elif key.endswith('error'): + value = value * 100 + wandb_log[this_key + '/' + key] = value + wandb.log(wandb_log) + log.update(other_log) + steps_log.append(log) + # next + action = best_delta_action + action + # aggregate + steps_log = transpose_data_dict(steps_log) + goals_log.append(steps_log) + rope_log = transpose_data_dict(goals_log) + pickle.dump(rope_log, open('rope_log.pkl', 'wb')) + if not cfg.offline: + log = get_error_plots_log('all', rope_log['error']) + wandb.log(log) + for vt in video_threads: + vt.join() + +# %% +if __name__ == '__main__': + main() + +# %% diff --git a/eval_irp_rope_sim.py b/eval_irp_rope_sim.py new file mode 100644 index 0000000..c59ea7f --- /dev/null +++ b/eval_irp_rope_sim.py @@ -0,0 +1,135 @@ +# %% +# import +import os +import pathlib +import yaml +import hydra +from hydra.utils import to_absolute_path +from omegaconf import DictConfig, OmegaConf +import zarr +import numpy as np +from tqdm import tqdm +import torch +import wandb +from scipy.interpolate import interp1d + +from common.sample_util import VirtualSampleGrid, transpose_data_dict +from environments.sim_environment import SimEnvironment +from environments.goal_selection import select_rope_and_goals +from common.wandb_util import get_error_plots_log + + +from networks.delta_trajectory_deeplab import DeltaTrajectoryDeeplab +from real_ur5.delta_action_sampler import DeltaActionGaussianSampler +from real_ur5.delta_action_selector import DeltaActionSelector + + +# %% +@hydra.main(config_path="config", config_name=pathlib.Path(__file__).stem) +def main(cfg: DictConfig) -> None: + if not cfg.offline: + wandb.init(**cfg.wandb) + abs_zarr_path = to_absolute_path(cfg.setup.zarr_path) + rope_goal_dict = select_rope_and_goals( + zarr_path=abs_zarr_path, + **cfg.setup.selection) + config = OmegaConf.to_container(cfg, resolve=True) + output_dir = os.getcwd() + config['output_dir'] = output_dir + yaml.dump(config, open('config.yaml', 'w'), default_flow_style=False) + if not cfg.offline: + wandb.config.update(config) + + root = zarr.open(abs_zarr_path, 'r') + init_action_array = root['train_rope/best_action_coord'] + action_scale = np.array(root[cfg.setup.name].shape[2:5]) + sample_grid = VirtualSampleGrid.from_zarr_group(root) + + # load action model + device = torch.device('cuda', cfg.action.gpu_id) + dtype = torch.float16 if cfg.action.use_fp16 else torch.float32 + sampler = DeltaActionGaussianSampler(**cfg.action.sampler) + action_model = DeltaTrajectoryDeeplab.load_from_checkpoint( + to_absolute_path(cfg.action.ckpt_path)) + 
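+    # run the delta-dynamics model on the configured GPU (fp16 if enabled) so that each IRP step can score a full batch of sampled delta actions at once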
action_model_gpu = action_model.to( + device, dtype=dtype).eval() + selector = DeltaActionSelector( + model=action_model_gpu, **cfg.action.selector) + + # load action model + ropes_log = list() + for rope_id, goals in rope_goal_dict.items(): + rope_config = { + 'length': sample_grid.dim_samples[0][rope_id[0]], + 'density': sample_grid.dim_samples[1][rope_id[1]] + } + env = SimEnvironment( + env_cfg=cfg.env, + rope_cfg=rope_config) + goals_log = list() + for goal_pix in tqdm(goals): + env.set_goal_pix(goal_pix) + # experiment + init_action = init_action_array[tuple(goal_pix)] / action_scale + action = init_action + # action = np.random.uniform(0,1,3) + steps_log = list() + # obs_log = list() + for step_id in range(cfg.setup.n_steps): + observation, dist_to_goal_m, _, info = env.step( + action) + # action = info['action'] + sigma = cfg.action.constant_sigma + if sigma is None: + sigma = min(dist_to_goal_m * cfg.action.gain, cfg.action.sigma_max) + + ts = cfg.action.threshold + threshold_interp = interp1d( + x=[0, ts.dist_max], y=[ts.max, ts.min], + kind='linear', + bounds_error=False, + fill_value=(ts.max, ts.min)) + threshold = threshold_interp(min(dist_to_goal_m, ts.dist_max)) + + delta_action_samples = sampler.get_delta_action_samples( + action, sigma=sigma) + selection_result = selector.get_delta_action( + traj_img=observation, goal_pix=goal_pix, + delta_action_samples=delta_action_samples, + threshold=threshold) + best_delta_action = selection_result['best_delta_action'] + + # logging + steps_log.append({ + 'error': dist_to_goal_m, + 'action': action + }) + # next + action = best_delta_action + action + # action = np.random.uniform(0,1,3) + # obs_log.append(observation) + + # aggregate + steps_log = transpose_data_dict(steps_log) + goals_log.append(steps_log) + + # aggregate data + rope_log = transpose_data_dict(goals_log) + rope_key = 'rope_' + '_'.join(str(x) for x in rope_id) + log = get_error_plots_log(rope_key, rope_log['error']) + if not cfg.offline: + wandb.log(log) + ropes_log.append(rope_log) + all_logs = transpose_data_dict(ropes_log) + errors = all_logs['error'].reshape(-1, all_logs['error'].shape[-1]) + log = get_error_plots_log('all', errors) + if not cfg.offline: + wandb.log(log) + import pickle + pickle.dump(all_logs, open('log.pkl', 'wb')) + +# %% +if __name__ == '__main__': + main() + +# %% diff --git a/images/calib_example.png b/images/calib_example.png new file mode 100644 index 0000000..7c58bcc Binary files /dev/null and b/images/calib_example.png differ diff --git a/images/teaser.jpg b/images/teaser.jpg new file mode 100644 index 0000000..97f0de5 Binary files /dev/null and b/images/teaser.jpg differ diff --git a/networks/delta_trajectory_deeplab.py b/networks/delta_trajectory_deeplab.py new file mode 100644 index 0000000..10155b1 --- /dev/null +++ b/networks/delta_trajectory_deeplab.py @@ -0,0 +1,70 @@ +import torch +import torch.nn as nn +import pytorch_lightning as pl + +from components.deeplab_v3_plus import DeepLabv3_feature, OutConv + +class DeltaTrajectoryDeeplab(pl.LightningModule): + def __init__(self, + action_sigma, + learning_rate=1e-3, + weight_decay=None, + loss='bce', + **kwargs): + super().__init__() + self.save_hyperparameters() + + n_features = 64 + self.net = DeepLabv3_feature(n_channels=4, n_features=n_features, os=8) + self.outc = OutConv(in_channels=n_features, out_channels=1) + + self.criterion = None + if loss == 'bce': + self.criterion = nn.BCEWithLogitsLoss() + elif loss == 'mse': + self.criterion = nn.MSELoss() + else: + raise 
RuntimeError("Invalid loss type") + + self.action_sigma = action_sigma + self.learning_rate = learning_rate + self.weight_decay = weight_decay + + def configure_optimizers(self): + learning_rate = self.learning_rate + weight_decay = self.weight_decay + optimizer = None + if weight_decay is None: + optimizer = torch.optim.Adam( + self.parameters(), lr=learning_rate) + else: + optimizer = torch.optim.AdamW( + self.parameters(), lr=learning_rate, + weight_decay=weight_decay) + return optimizer + + def forward(self, input_trajectory, action_delta): + action_delta_ = action_delta / self.action_sigma + exploded_action_delta = action_delta_.reshape( + *action_delta_.shape,1,1).expand( + -1,-1,*input_trajectory.shape[2:]) + x = torch.cat([input_trajectory, exploded_action_delta], dim=1) + features = self.net(x) + logits = self.outc(features) + return logits + + def step(self, batch, batch_idx, step_type='train'): + input_trajectory = batch['input_trajectory'] + action_delta = batch['action_delta'] + target_trajectory = batch['target_trajectory'] + logits = self.forward(input_trajectory, action_delta) + loss = self.criterion(logits, target_trajectory) + + self.log(f"{step_type}_loss", loss) + return loss + + def training_step(self, batch, batch_idx): + return self.step(batch, batch_idx, 'train') + + def validation_step(self, batch, batch_idx): + return self.step(batch, batch_idx, 'val') diff --git a/networks/keypoint_deeplab.py b/networks/keypoint_deeplab.py new file mode 100644 index 0000000..4833d49 --- /dev/null +++ b/networks/keypoint_deeplab.py @@ -0,0 +1,172 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import pytorch_lightning as pl +import torchvision +import torchvision.models.segmentation.segmentation as tvs + +from common.torch_util import to_numpy + +class KeypointDeeplab(pl.LightningModule): + def __init__(self, + learning_rate=1e-3, + weight_decay=None, + upsample = False, + **kwargs): + super().__init__() + self.save_hyperparameters() + + self.deeplab_model = tvs.deeplabv3_resnet50( + pretrained=False, + progress=False, + num_classes=1, + aux_loss=None) + # delete last layer + self.deeplab_model.classifier = nn.Sequential(*list( + self.deeplab_model.classifier)[:-1]) + + # deeplab v3 plus + self.mid_conv = nn.Sequential( + nn.Conv2d(256, 48, 1, bias=False), + nn.BatchNorm2d(48), + nn.ReLU() + ) + + self.last_conv = None + if upsample: + self.last_conv = nn.Sequential( + nn.Conv2d(304, 256, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(), + nn.Conv2d(256, 64, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(64), + nn.ReLU(), + nn.Conv2d(64, 1, kernel_size=3, stride=1, padding=1)) + else: + self.last_conv = nn.Sequential( + nn.Conv2d(304, 256, kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(256), + nn.ReLU(), + nn.ConvTranspose2d(256, 64, kernel_size=3, stride=2, padding=1, output_padding=1, bias=False), + nn.BatchNorm2d(64), + nn.ReLU(), + nn.ConvTranspose2d(64, 1, kernel_size=3, stride=2, padding=1, output_padding=1)) + + self.upsample = upsample + self.criterion = nn.BCEWithLogitsLoss() + self.learning_rate = learning_rate + self.weight_decay = weight_decay + + def configure_optimizers(self): + learning_rate = self.learning_rate + weight_decay = self.weight_decay + optimizer = None + if weight_decay is None: + optimizer = torch.optim.Adam( + self.parameters(), lr=learning_rate) + else: + optimizer = torch.optim.AdamW( + self.parameters(), lr=learning_rate, + 
weight_decay=weight_decay) + return optimizer + + def load_pretrained_weight(self): + model_url = tvs.model_urls['deeplabv3_resnet50_coco'] + state_dict = tvs.load_state_dict_from_url(model_url, progress=True) + redundent_keys = ['classifier.4.weight', 'classifier.4.bias'] + for key in redundent_keys: + del state_dict[key] + + result = self.deeplab_model.load_state_dict( + state_dict, strict=False) + return result + + def backbone_forward(self, x): + # hack to be able to use pre-trained model + self = self.deeplab_model.backbone + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + l1 = x + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + l4 = x + + result = { + 'l1': l1, + 'l4': l4 + } + return result + + def forward(self, batch): + input_shape = batch.shape[-2:] + features = self.backbone_forward(batch) + + # 8x down + features_out = features['l4'] + x = self.deeplab_model.classifier(features_out) + + # 4x down + low_level_features = features['l1'] + low_level_features = self.mid_conv(low_level_features) + + x = F.interpolate( + x, size=low_level_features.shape[2:], + mode='bilinear', align_corners=True) + x = torch.cat((x, low_level_features), dim=1) + x = self.last_conv(x) + scoremap_raw = x + + if self.upsample: + # scoremap_raw is 4x down + scoremap = F.interpolate( + scoremap_raw, size=batch.shape[2:], + mode='bilinear', align_corners=True) + else: + # scoremap_raw is 1x + scoremap = scoremap_raw + + result = { + 'features': features, + 'scoremap_raw': scoremap_raw, + 'scoremap': scoremap + } + return result + + def step(self, batch, batch_idx, step_type='train'): + input_img = batch['input_image'] + target_scoremap = batch['target_scoremap'] # (N,3,H,W) + target_keypoint = batch['target_keypoint'] # (N,2) float32 + + result = self.forward(input_img) + + scoremap = result['scoremap'] + loss = self.criterion(scoremap, target_scoremap) + + # compute keypoint distance + target_keypoint_np = to_numpy(target_keypoint) + pred_idx_np = to_numpy(torch.argmax( + scoremap.reshape(scoremap.shape[0], -1), + dim=-1, keepdim=False)) + pred_keypoint_np = np.stack(np.unravel_index( + pred_idx_np, shape=input_img.shape[2:])).T.astype(np.float32)[:,::-1] + keypoint_dist = np.linalg.norm(pred_keypoint_np - target_keypoint_np, axis=-1) + avg_keypoint_dist = np.mean(keypoint_dist) + + # logging + self.log(f"{step_type}_loss", loss) + self.log(f"{step_type}_keypoint_dist", avg_keypoint_dist) + return loss + + def training_step(self, batch, batch_idx): + return self.step(batch, batch_idx, 'train') + + def validation_step(self, batch, batch_idx): + return self.step(batch, batch_idx, 'val') + + diff --git a/pl_vis/image_grid_callback.py b/pl_vis/image_grid_callback.py new file mode 100644 index 0000000..4db01c6 --- /dev/null +++ b/pl_vis/image_grid_callback.py @@ -0,0 +1,74 @@ +import pytorch_lightning as pl +from pytorch_lightning.loggers import WandbLogger +import torch +from torch.utils.data import Subset, DataLoader +import wandb +import numpy as np +from common.torch_util import dict_to + + +def stack_to_grid(stack, grid_shape): + rows = list() + for i in range(grid_shape[0]): + row = list() + for j in range(grid_shape[1]): + idx = i * grid_shape[1] + j + img = stack[idx] + row.append(img) + rows.append(torch.cat(row, axis=1)) + img = torch.cat(rows, axis=0) + return img + + +class ImageGridCallback(pl.Callback): + def __init__(self, + val_dataset, + input_key='occupancy', + output_key=None, + grid_shape=(3,3), + num_samples=16, + seed=0, 
+ sigmoid=True): + super().__init__() + rs = np.random.RandomState(seed=seed) + vis_idxs = rs.choice(len(val_dataset), num_samples) + vis_subset = Subset(val_dataset, vis_idxs) + vis_dataloader = DataLoader( + vis_subset, batch_size=num_samples) + vis_batch = next(iter(vis_dataloader)) + self.vis_batch = vis_batch + self.vis_idxs = vis_idxs + self.input_key = input_key + self.output_key = output_key + self.sigmoid = sigmoid + self.grid_shape = grid_shape + + def on_validation_epoch_end(self, + trainer: pl.Trainer, + pl_module: pl.LightningModule) -> None: + vis_batch_device = dict_to( + self.vis_batch, + device=pl_module.device) + result = pl_module(**vis_batch_device) + logits = result + if self.output_key is not None: + logits = result[self.output_key] + probs = logits + if self.sigmoid: + probs = torch.sigmoid(logits) + probs_cpu = probs.detach().to('cpu') + + imgs = list() + for batch_idx in range(len(self.vis_idxs)): + vis_idx = self.vis_idxs[batch_idx] + gt_img = stack_to_grid(self.vis_batch[self.input_key][batch_idx], self.grid_shape) + pred_img = stack_to_grid(probs_cpu[batch_idx], self.grid_shape) + img_pair = torch.cat([gt_img, pred_img], dim=1) + imgs.append(wandb.Image( + img_pair, caption=f"val-{vis_idx}" + )) + + trainer.logger.experiment.log({ + "val/vis": imgs, + "global_step": trainer.global_step + }) diff --git a/pl_vis/image_pair_callback.py b/pl_vis/image_pair_callback.py new file mode 100644 index 0000000..de3faeb --- /dev/null +++ b/pl_vis/image_pair_callback.py @@ -0,0 +1,61 @@ +import inspect +import pytorch_lightning as pl +from pytorch_lightning.loggers import WandbLogger +import torch +from torch.utils.data import Subset, DataLoader +import wandb +import numpy as np +from common.torch_util import dict_to + + +class ImagePairCallback(pl.Callback): + def __init__(self, + val_dataset, input_key='occupancy', + output_key=None, + num_samples=16, + seed=0, + sigmoid=True): + super().__init__() + rs = np.random.RandomState(seed=seed) + vis_idxs = rs.choice(len(val_dataset), num_samples) + vis_subset = Subset(val_dataset, vis_idxs) + vis_dataloader = DataLoader( + vis_subset, batch_size=num_samples) + vis_batch = next(iter(vis_dataloader)) + self.vis_batch = vis_batch + self.vis_idxs = vis_idxs + self.input_key = input_key + self.output_key = output_key + self.sigmoid = sigmoid + + def on_validation_epoch_end(self, + trainer: pl.Trainer, + pl_module: pl.LightningModule) -> None: + vis_batch_device = dict_to( + self.vis_batch, + device=pl_module.device) + func_args = set(inspect.signature(pl_module.forward).parameters.keys()) + kwargs = dict([(x, vis_batch_device[x]) for x in func_args]) + result = pl_module(**kwargs) + logits = result + if self.output_key is not None: + logits = result[self.output_key] + probs = logits + if self.sigmoid: + probs = torch.sigmoid(logits) + probs_cpu = probs.detach().to('cpu') + + imgs = list() + for i in range(len(self.vis_idxs)): + vis_idx = self.vis_idxs[i] + gt_img = self.vis_batch[self.input_key][i] + pred_img = probs_cpu[i] + img_pair = torch.cat([gt_img, pred_img], dim=1) + imgs.append(wandb.Image( + img_pair, caption=f"val-{vis_idx}" + )) + + trainer.logger.experiment.log({ + "val/vis": imgs, + "global_step": trainer.global_step + }) diff --git a/pl_vis/keypoint_callback.py b/pl_vis/keypoint_callback.py new file mode 100644 index 0000000..e2e3eb9 --- /dev/null +++ b/pl_vis/keypoint_callback.py @@ -0,0 +1,85 @@ +import pytorch_lightning as pl +from pytorch_lightning.loggers import WandbLogger +import torch +from torch.utils.data 
import Subset, DataLoader +import wandb +import numpy as np +import cv2 +from matplotlib import cm + +from common.torch_util import dict_to, to_numpy + + +class KeypointCallback(pl.Callback): + def __init__(self, + val_dataset, + num_samples=4, seed=0): + super().__init__() + rs = np.random.RandomState(seed=seed) + vis_idxs = rs.choice(len(val_dataset), num_samples) + vis_subset = Subset(val_dataset, vis_idxs) + vis_dataloader = DataLoader( + vis_subset, batch_size=num_samples) + vis_batch = next(iter(vis_dataloader)) + self.vis_batch = vis_batch + self.vis_idxs = vis_idxs + + def on_validation_epoch_end(self, + trainer: pl.Trainer, + pl_module: pl.LightningModule) -> None: + vis_batch_device = dict_to( + self.vis_batch, + device=pl_module.device) + + batch = vis_batch_device + input_img = batch['input_image'] + # target_scoremap = batch['target_scoremap'] # (N,3,H,W) + target_keypoint = batch['target_keypoint'] # (N,2) float32 + + result = pl_module.forward(input_img) + + scoremap = result['scoremap'] + scoremap_probs = torch.sigmoid(scoremap) + + # compute keypoint distance + target_keypoint_np = to_numpy(target_keypoint) + pred_idx_np = to_numpy(torch.argmax( + scoremap.reshape(scoremap.shape[0], -1), + dim=-1, keepdim=False)) + pred_keypoint_np = np.stack(np.unravel_index( + pred_idx_np, shape=input_img.shape[2:])).T.astype(np.float32)[:,::-1] + # keypoint_dist = np.linalg.norm(pred_keypoint_np - target_keypoint_np, axis=-1) + + # two images + def draw_keypoint(img, pred_keypoint, target_keypoint): + cv2.drawMarker(img, pred_keypoint, + color=(255,0,0), markerType=cv2.MARKER_CROSS, + markerSize=20, thickness=1) + cv2.drawMarker(img, target_keypoint, + color=(0,0,255), markerType=cv2.MARKER_CROSS, + markerSize=20, thickness=1) + + mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) + std = np.array([0.229, 0.224, 0.225], dtype=np.float32) + imgs = list() + for i in range(len(self.vis_idxs)): + vis_idx = self.vis_idxs[i] + input_rgb_img = np.moveaxis(to_numpy(input_img[i]), 0, -1) + input_rgb_img = np.ascontiguousarray((input_rgb_img * std + mean) * 255).astype(np.uint8) + scoremap_vis = np.ascontiguousarray(cm.get_cmap('viridis')(to_numpy( + scoremap_probs[i,0]))[:,:,:3] * 255).astype(np.uint8) + + pred_p = np.round(pred_keypoint_np[i]).astype(np.int32) + target_p = np.round(target_keypoint_np[i]).astype(np.int32) + draw_keypoint(input_rgb_img, pred_p, target_p) + draw_keypoint(scoremap_vis, pred_p, target_p) + + img_pair = np.concatenate([input_rgb_img, scoremap_vis], axis=0) + imgs.append(wandb.Image( + img_pair, caption=f"val-{vis_idx}" + )) + + trainer.logger.experiment.log({ + "val/vis": imgs, + "global_step": trainer.global_step + }) diff --git a/real_ur5/buffered_tracker.py b/real_ur5/buffered_tracker.py new file mode 100644 index 0000000..d22ae76 --- /dev/null +++ b/real_ur5/buffered_tracker.py @@ -0,0 +1,60 @@ +import time +import numpy as np +from threading import Thread + +import torch +from real_ur5.zed_ram_recorder import ZedRamRecorder +from components.tracking import KeypointTracker +from common.torch_util import to_numpy + +class BufferedTracker(Thread): + def __init__(self, + recorder: ZedRamRecorder, + tracker: KeypointTracker, + max_batch_size=4): + super().__init__() + assert(max_batch_size > 0) + + self.recorder = recorder + self.tracker = tracker + self.max_batch_size = max_batch_size + # external facing + self.imgs_list = list() + self.keypoints_list = list() + self.confidence_list = list() + + def is_done(self): + return self.recorder.is_done() and 
(len(self.recorder.deque) == 0) + + def run(self): + self.recorder.start() + while not self.is_done(): + batch_size = min(self.max_batch_size, len(self.recorder.deque)) + if batch_size > 0: + batch_list = list() + for i in range(batch_size): + batch_list.append(self.recorder.deque.popleft()) + batch = np.concatenate(batch_list, axis=0) + keypoints, confidence = self.tracker(self.tracker.process_input(batch)) + self.imgs_list.append(batch) + self.keypoints_list.append(keypoints) + self.confidence_list.append(confidence) + time.sleep(0) + # wait for kernels to finish + # important to get correct output + torch.cuda.synchronize(self.tracker.device) + + def stop(self): + self.recorder.stop(blocking=False) + + def get_images(self): + self.join() + return np.concatenate(self.imgs_list, axis=0) + + def get_tracking(self): + self.join() + result = { + 'keypoints': to_numpy(torch.cat(self.keypoints_list, dim=0)), + 'confidence': to_numpy(torch.cat(self.confidence_list, dim=0)) + } + return result diff --git a/real_ur5/delta_action_sampler.py b/real_ur5/delta_action_sampler.py new file mode 100644 index 0000000..a51bd14 --- /dev/null +++ b/real_ur5/delta_action_sampler.py @@ -0,0 +1,68 @@ +import numpy as np +from common.sample_util import get_nd_index_volume + + +def get_distance(traj_img, goal_pix): + coord_img = get_nd_index_volume(traj_img.shape) + coords = coord_img[traj_img] + dists = np.linalg.norm(coords - goal_pix, axis=-1) + min_idx = np.argmin(dists) + dist = dists[min_idx] + coord = coords[min_idx] + return dist, coord + + +class DeltaActionGridSampler: + def __init__(self, delta=3, grid_shape=(32,32,32)): + self.grid_shape = grid_shape + self.delta = delta + + def get_delta_action_samples(self, action: np.ndarray): + """ + Assume action is 0-1 3d array float32 + """ + # conver to grid coord + delta = self.delta + grid_shape = self.grid_shape + # corner aligned + corners = np.array(grid_shape, dtype=np.float32) - 1 + action_coord = (action * corners + ).round().astype(np.int64) + + # rejection sample + delta_coord = get_nd_index_volume((delta*2+1,)*3) - delta + next_action_coord = delta_coord + action_coord + is_low = np.all(next_action_coord >= np.array([0,0,0]), axis=-1) + is_high = np.all(next_action_coord < np.array(grid_shape), axis=-1) + valid_next_action_coord = next_action_coord[is_low & is_high] + + # convert to float + next_action_samples = valid_next_action_coord.astype( + np.float32) / corners + delta_action_samples = next_action_samples - action + return delta_action_samples + + +class DeltaActionGaussianSampler: + def __init__(self, num_samples=128, seed=0, dim=3): + self.num_samples = num_samples + self.rs = np.random.RandomState(seed=seed) + self.dim = dim + + def get_delta_action_samples(self, + action: np.ndarray, sigma: float=1/32): + + delta_actions = list() + while len(delta_actions) < self.num_samples: + n_needed = self.num_samples - len(delta_actions) + delta_samples = self.rs.normal( + loc=0, scale=sigma, + size=(n_needed,self.dim)) + action_samples = delta_samples + action + is_valid = np.all( + action_samples >= 0, axis=-1) & np.all( + action_samples < 1, axis=-1) + valid_delta_samples = delta_samples[is_valid] + delta_actions.extend(valid_delta_samples) + delta_actions = np.array(delta_actions) + return delta_actions diff --git a/real_ur5/delta_action_selector.py b/real_ur5/delta_action_selector.py new file mode 100644 index 0000000..63c32eb --- /dev/null +++ b/real_ur5/delta_action_selector.py @@ -0,0 +1,147 @@ +from typing import Callable +import numpy as np 
+import pytorch_lightning as pl +import torch + +from common.torch_util import to_numpy +from common.sample_util import ArraySlicer, get_nd_index_volume + + +def dict_expand_batch(input, batch_size): + output = dict() + for key, value in input.items(): + output[key] = value.expand(batch_size, *value.shape[1:]) + return output + + +class DeltaActionSelector: + def __init__(self, + model: pl.LightningModule, + batch_size=32): + self.model = model + self.batch_size = batch_size + + def get_dist_img(self, goal_pix, img_shape): + """ + Unit in pixels + """ + img_coords = get_nd_index_volume(img_shape) + dist_img = np.linalg.norm( + img_coords - goal_pix, axis=-1) + return dist_img + + def get_delta_action(self, + traj_img:np.ndarray, + goal_pix:np.ndarray, + delta_action_samples:np.ndarray, + threshold:float): + with torch.no_grad(): + device = self.model.device + dtype = self.model.dtype + input_trajectory = torch.from_numpy( + traj_img.reshape(1,1,*traj_img.shape)).to( + device=device, dtype=dtype, non_blocking=False) + + + num_samples = delta_action_samples.shape[0] + batch_size = self.batch_size + slicer = ArraySlicer((num_samples,), (batch_size,)) + all_probs = list() + for i in range(len(slicer)): + this_slice = slicer[i][0] + this_samples = delta_action_samples[this_slice] + this_batch_size = len(this_samples) + this_input_trajectory = input_trajectory.expand( + this_batch_size, *input_trajectory.shape[1:]) + x = torch.from_numpy(this_samples).to( + device=device, dtype=dtype, non_blocking=True) + x = torch.sigmoid(self.model(this_input_trajectory, x)) + all_probs.append(x) + probs = torch.cat(all_probs, dim=0) + probs = to_numpy(probs[:,0,:,:]).astype(np.float32) + can_hit_mask = probs > threshold + + # action selection + dist_img = self.get_dist_img(goal_pix, traj_img.shape) + min_dists = list() + for i in range(len(can_hit_mask)): + dists = dist_img[can_hit_mask[i]] + if len(dists) == 0: + min_dists.append(np.inf) + else: + min_dists.append(dists.min()) + min_dists = np.array(min_dists) + best_action_idx = np.argmin(min_dists) + best_delta_action = delta_action_samples[best_action_idx] + + result = { + 'best_delta_action': best_delta_action, + 'best_action_idx': best_action_idx, + 'best_mask': can_hit_mask[best_action_idx], + 'best_prob': probs[best_action_idx], + 'probs': probs, + 'distances': min_dists + } + return result + + +class DeltaActionLossSelector: + def __init__(self, + model: pl.LightningModule, + batch_size=32): + self.model = model + self.batch_size = batch_size + + def get_dist_img(self, goal_pix, img_shape): + """ + Unit in pixels + """ + img_coords = get_nd_index_volume(img_shape) + dist_img = np.linalg.norm( + img_coords - goal_pix, axis=-1) + return dist_img + + def get_delta_action(self, + traj_img:np.ndarray, + delta_action_samples:np.ndarray, + loss_func: Callable[[np.ndarray], float], + threshold:float): + with torch.no_grad(): + device = self.model.device + dtype = self.model.dtype + input_trajectory = torch.from_numpy( + traj_img.reshape(1,*traj_img.shape)).to( + device=device, dtype=dtype, non_blocking=False) + + num_samples = delta_action_samples.shape[0] + batch_size = self.batch_size + slicer = ArraySlicer((num_samples,), (batch_size,)) + all_probs = list() + for i in range(len(slicer)): + this_slice = slicer[i][0] + this_samples = delta_action_samples[this_slice] + this_batch_size = len(this_samples) + this_input_trajectory = input_trajectory.expand( + this_batch_size, *input_trajectory.shape[1:]) + x = torch.from_numpy(this_samples).to( + device=device, 
dtype=dtype, non_blocking=True) + x = torch.sigmoid(self.model(this_input_trajectory, x)) + all_probs.append(x) + probs = to_numpy(torch.cat(all_probs, dim=0)).astype(np.float32) + can_hit_mask = probs > threshold + + # action selection + loss_arr = list() + for i in range(len(can_hit_mask)): + loss_arr.append(loss_func(can_hit_mask[i])) + best_action_idx = np.argmin(loss_arr) + best_delta_action = delta_action_samples[best_action_idx] + + result = { + 'best_delta_action': best_delta_action, + 'best_action_idx': best_action_idx, + 'best_mask': can_hit_mask[best_action_idx], + 'best_prob': probs[best_action_idx], + 'probs': probs + } + return result diff --git a/real_ur5/swing_actor.py b/real_ur5/swing_actor.py new file mode 100644 index 0000000..d5e1294 --- /dev/null +++ b/real_ur5/swing_actor.py @@ -0,0 +1,62 @@ +import time +import numpy as np +import scipy.interpolate as si +from rtde_control import RTDEControlInterface +from rtde_receive import RTDEReceiveInterface + + +def deg_to_rad(deg): + return deg / 180 * np.pi + + +class SwingActor: + def __init__(self, + robot_ip='192.168.0.139', + q_init=(-90,-70,150,-170,-90,-90), + speed_range=(1.0,3.14), + j2_delta_range=(-60,-180), + j3_delta_range=(60,-120), + tcp_offset=0.5) -> None: + self.q_init = deg_to_rad(np.array(q_init)) + self.speed_func = si.interp1d((0,1),speed_range) + self.j2_delta_func = si.interp1d( + (0,1), deg_to_rad(np.array(j2_delta_range))) + self.j3_delta_func = si.interp1d( + (0,1), deg_to_rad(np.array(j3_delta_range))) + + tcp_offset_pose = [0.0]*6 + tcp_offset_pose[2] = tcp_offset + self.tcp_offset_pose = tcp_offset_pose + self.robot_ip = robot_ip + self.rtde_c = None + self.rtde_r = None + + def __enter__(self): + self.rtde_c = RTDEControlInterface(self.robot_ip) + self.rtde_r = RTDEReceiveInterface(self.robot_ip) + self.rtde_c.setTcp(self.tcp_offset_pose) + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.rtde_c.stopScript() + self.rtde_c.disconnect() + self.rtde_r.disconnect() + + def get_action(self, spd, j2, j3): + this_speed = self.speed_func(spd) + this_q_goal = self.q_init.copy() + this_q_goal[2] += self.j2_delta_func(j2) + this_q_goal[3] += self.j3_delta_func(j3) + return this_speed, this_q_goal + + def reset(self, acc=10.0, speed=3.0, blocking=True): + self.rtde_c.moveJ( + self.q_init.tolist(), + speed, acc, not blocking) + + def swing(self, spd, j2, j3, acc=10.0, blocking=True): + speed, q_goal = self.get_action(spd, j2, j3) + self.rtde_c.moveJ( + q_goal.tolist(), + speed, acc, not blocking) + diff --git a/real_ur5/trajectory_projector.py b/real_ur5/trajectory_projector.py new file mode 100644 index 0000000..a560eae --- /dev/null +++ b/real_ur5/trajectory_projector.py @@ -0,0 +1,52 @@ +import numpy as np + +from common.sample_util import GridCoordTransformer +from common.cv_util import get_traj_occupancy +from common.geometry_util import homo_transform + +class TrajectoryProjector: + def __init__(self, + tx_img_robot, + transformer=None, + flip_x=True, + confidence_threshold=0.3, + **kwargs): + if transformer is None: + transformer = self.get_default_transformer() + self.flip_x=flip_x + self.transformer = transformer + self.tx_img_robot = tx_img_robot + self.tx_robot_img = np.linalg.inv(tx_img_robot) + self.confidence_threshold = confidence_threshold + + @staticmethod + def get_default_transformer(): + img_shape = (256, 256) + transformer = GridCoordTransformer( + (-3.0,-3.0),(3.0,3.0),img_shape) + return transformer + + def to_robot_frame(self, keypoints): + return 
homo_transform(self.tx_robot_img, keypoints) + + def robot_to_image(self, keypoints): + return homo_transform(self.tx_img_robot, keypoints) + + def grid_to_image(self, grid_points): + sim_points = self.transformer.from_grid(grid_points) + if self.flip_x: + sim_points[:,0] *= -1 + return homo_transform(self.tx_img_robot, sim_points) + + def get_sim_traj_img(self, tracking_result): + keypoints = tracking_result['keypoints'][:,0,:] + confidence = tracking_result['confidence'][:,0] + mask = confidence > self.confidence_threshold + valid_keypoints = keypoints[mask] + + sim_points = self.to_robot_frame(valid_keypoints) + if self.flip_x: + # robot setup is flipped with simulation + sim_points[:,0] *= -1 + traj_img = get_traj_occupancy(sim_points, self.transformer) + return traj_img diff --git a/real_ur5/zed_camera.py b/real_ur5/zed_camera.py new file mode 100644 index 0000000..ad006ee --- /dev/null +++ b/real_ur5/zed_camera.py @@ -0,0 +1,97 @@ +import numpy as np +import pyzed.sl as sl +import time + +class ZedCamera: + def __init__(self, + init_params=None, + video_settings=None, + left=True, + right=False): + if init_params is None: + init_params = self.get_default_init_params() + if video_settings is None: + video_settings = self.get_default_video_settings() + assert(left or right) + + self.init_params = init_params + self.video_settings = video_settings + self.left = left + self.right = right + self.camera = None + self.mat = sl.Mat() + + @staticmethod + def get_default_init_params(): + return sl.InitParameters( + camera_resolution=sl.RESOLUTION.HD720, + camera_fps=60, + depth_mode=sl.DEPTH_MODE.NONE, + sdk_verbose=True, + sdk_gpu_id=0, + enable_image_enhancement=True + ) + + @staticmethod + def get_default_video_settings(): + # return { + # 'BRIGHTNESS': 4, + # 'CONTRAST': 4, + # 'HUE': 0, + # 'SATURATION': 4, + # 'SHARPNESS': 4, + # 'GAMMA': 9, + # 'GAIN': 100, + # 'EXPOSURE': 6, + # 'WHITEBALANCE_TEMPERATURE': 3200, + # 'LED_STATUS': 1 + # } + return { + 'BRIGHTNESS': 4, + 'CONTRAST': 4, + 'HUE': 0, + 'SATURATION': 4, + 'SHARPNESS': 4, + 'GAMMA': 6, + 'GAIN': 100, + 'EXPOSURE': 9, + 'WHITEBALANCE_TEMPERATURE': 4200, + 'LED_STATUS': 1 + } + + def is_opened(self): + return self.camera.is_opened() + + def __enter__(self): + zed = sl.Camera() + err = zed.open(self.init_params) + if (err != sl.ERROR_CODE.SUCCESS): + print("Unable to start camera.") + return + + time.sleep(0.5) + for key, value in self.video_settings.items(): + zed.set_camera_settings(getattr(sl.VIDEO_SETTINGS, key), value) + + self.camera = zed + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.camera.close() + + def get_image(self) -> np.ndarray: + """ + Return ndarray + (N,H,W,4) uint8 bgra + """ + err = self.camera.grab() + if (err == sl.ERROR_CODE.SUCCESS): + imgs = list() + if self.left: + self.camera.retrieve_image(self.mat, sl.VIEW.LEFT) + imgs.append(self.mat.get_data()) + if self.right: + self.camera.retrieve_image(self.mat, sl.VIEW.RIGHT) + imgs.append(self.mat.get_data()) + return np.array(imgs) + return None diff --git a/real_ur5/zed_ram_recorder.py b/real_ur5/zed_ram_recorder.py new file mode 100644 index 0000000..4de9719 --- /dev/null +++ b/real_ur5/zed_ram_recorder.py @@ -0,0 +1,81 @@ +import time +import numpy as np +from real_ur5.zed_camera import ZedCamera +import pyzed.sl as sl +from threading import Thread, Event +import collections + + +class ZedRamRecorder(Thread): + def __init__(self, camera: ZedCamera, maxlen=6000): + super().__init__() + assert(camera.is_opened()) + 
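+        # frames appended by run() accumulate in this bounded deque; recording ends once stop() is called or maxlen frames have been grabbed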
self.camera = camera + self.deque = collections.deque(maxlen=maxlen) + self.maxlen = maxlen + self.counter = 0 + self.stop_event = Event() + + def is_done(self): + return not((not self.stop_event.is_set()) and (self.counter < self.maxlen)) + + def run(self): + while not self.is_done(): + img = self.camera.get_image() + if img is not None: + self.deque.append(img) + self.counter += 1 + time.sleep(0) + + def stop(self, blocking=True): + self.stop_event.set() + if blocking: + self.join() + + def __del__(self): + self.stop(blocking=False) + + +class SVORamRecorder(Thread): + def __init__(self, + svo_path, + left=True, + right=False): + super().__init__() + init_params = sl.InitParameters() + init_params.set_from_svo_file(svo_path) + camera = sl.Camera() + assert(camera.open(init_params) == sl.ERROR_CODE.SUCCESS) + self.camera = camera + self.deque = collections.deque() + self.stop_event = Event() + self.left = left + self.right = right + self.mat = sl.Mat() + + def is_done(self): + return self.stop_event.is_set() + + def run(self): + camera = self.camera + while not self.is_done(): + err = camera.grab() + if err != sl.ERROR_CODE.SUCCESS: + self.stop_event.set() + break + imgs = list() + if self.left: + self.camera.retrieve_image(self.mat, sl.VIEW.LEFT) + imgs.append(self.mat.get_data()) + if self.right: + self.camera.retrieve_image(self.mat, sl.VIEW.RIGHT) + imgs.append(self.mat.get_data()) + self.deque.append(np.array(imgs)) + + def stop(self, blocking=True): + self.stop_event.set() + if blocking: + self.join() + + def __del__(self): + self.stop(blocking=False) diff --git a/train_irp.py b/train_irp.py new file mode 100644 index 0000000..3ae87fc --- /dev/null +++ b/train_irp.py @@ -0,0 +1,70 @@ +# %% +# import +import os +import pathlib +import yaml +import hydra +from omegaconf import DictConfig, OmegaConf +import pytorch_lightning as pl + +from datasets.delta_trajectory_gaussian_dataset import DeltaTrajectoryGaussianDataModule +from networks.delta_trajectory_deeplab import DeltaTrajectoryDeeplab +from pl_vis.image_pair_callback import ImagePairCallback + + +# %% +# main script +@hydra.main(config_path="config", config_name=pathlib.Path(__file__).stem) +def main(cfg: DictConfig) -> None: + # hydra creates working directory automatically + print(os.getcwd()) + os.mkdir("checkpoints") + + datamodule = DeltaTrajectoryGaussianDataModule(**cfg.datamodule) + cfg.model.action_sigma = cfg.datamodule.action_sigma + model = DeltaTrajectoryDeeplab(**cfg.model) + + logger = pl.loggers.WandbLogger( + project=os.path.basename(__file__), + **cfg.logger) + wandb_run = logger.experiment + wandb_meta = { + 'run_name': wandb_run.name, + 'run_id': wandb_run.id + } + all_config = { + 'config': OmegaConf.to_container(cfg, resolve=True), + 'output_dir': os.getcwd(), + 'wandb': wandb_meta + } + yaml.dump(all_config, open('config.yaml', 'w'), default_flow_style=False) + logger.log_hyperparams(all_config) + + datamodule.prepare_data() + val_dataset = datamodule.get_dataset('val') + + checkpoint_callback = pl.callbacks.ModelCheckpoint( + dirpath="checkpoints", + filename="{epoch}-{val_loss:.4f}", + monitor='val_loss', + save_last=True, + save_top_k=5, + mode='min', + save_weights_only=False, + every_n_epochs=1, + save_on_train_epoch_end=True) + vis_callback = ImagePairCallback( + val_dataset, + **cfg.vis_callback + ) + trainer = pl.Trainer( + callbacks=[checkpoint_callback, vis_callback], + checkpoint_callback=True, + logger=logger, + **cfg.trainer) + trainer.fit(model=model, datamodule=datamodule) + +# %% +# driver 
+if __name__ == "__main__": + main() diff --git a/train_tracker.py b/train_tracker.py new file mode 100644 index 0000000..67c072d --- /dev/null +++ b/train_tracker.py @@ -0,0 +1,71 @@ +# %% +# import +import os +import pathlib +import yaml +import hydra +from omegaconf import DictConfig, OmegaConf +import pytorch_lightning as pl + +from datasets.keypoint_imgaug_dataset import KeypointImgaugDataModule +from networks.keypoint_deeplab import KeypointDeeplab +from pl_vis.keypoint_callback import KeypointCallback + +# %% +# main script +@hydra.main(config_path="config", config_name=pathlib.Path(__file__).stem) +def main(cfg: DictConfig) -> None: + # hydra creates working directory automatically + print(os.getcwd()) + os.mkdir("checkpoints") + + datamodule = KeypointImgaugDataModule(**cfg.datamodule) + model = KeypointDeeplab(**cfg.model) + model.load_pretrained_weight() + + logger = pl.loggers.WandbLogger( + project=os.path.basename(__file__), + **cfg.logger) + wandb_run = logger.experiment + wandb_meta = { + 'run_name': wandb_run.name, + 'run_id': wandb_run.id + } + all_config = { + 'config': OmegaConf.to_container(cfg, resolve=True), + 'output_dir': os.getcwd(), + 'wandb': wandb_meta + } + yaml.dump(all_config, open('config.yaml', 'w'), default_flow_style=False) + logger.log_hyperparams(all_config) + + datamodule.prepare_data() + val_dataset = datamodule.get_dataset('val') + + checkpoint_callback = pl.callbacks.ModelCheckpoint( + dirpath="checkpoints", + # filename="{epoch}-{val_loss:.4f}", + # monitor='val_loss', + filename="{epoch}-{val_keypoint_dist:.4f}", + monitor='val_keypoint_dist', + save_last=True, + save_top_k=5, + mode='min', + save_weights_only=False, + every_n_epochs=1, + save_on_train_epoch_end=True) + vis_callback = KeypointCallback( + val_dataset, + **cfg.vis_callback + ) + trainer = pl.Trainer( + callbacks=[checkpoint_callback, vis_callback], + checkpoint_callback=True, + logger=logger, + **cfg.trainer) + trainer.fit(model=model, datamodule=datamodule) + +# %% +# driver +if __name__ == "__main__": + main() diff --git a/ur5_camera_calibration_app.py b/ur5_camera_calibration_app.py new file mode 100644 index 0000000..b8e8af8 --- /dev/null +++ b/ur5_camera_calibration_app.py @@ -0,0 +1,305 @@ +# %% +import argparse +import pathlib +import pickle + +import numpy as np +import cv2 +import pyzed.sl as sl +from rtde_control import RTDEControlInterface +from rtde_receive import RTDEReceiveInterface + +# %% +class Robot: + def __init__(self, robot_ip, tcp_offset=0.5): + tcp_offset_pose = [0.0]*6 + tcp_offset_pose[2] = tcp_offset + self.tcp_offset_pose = tcp_offset_pose + + self.robot_ip = robot_ip + self.rtde_c = None + self.rtde_r = None + + def __enter__(self): + self.rtde_c = RTDEControlInterface(self.robot_ip) + self.rtde_r = RTDEReceiveInterface(self.robot_ip) + self.rtde_c.setTcp(self.tcp_offset_pose) + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.rtde_c.stopScript() + self.rtde_c.disconnect() + self.rtde_r.disconnect() + + def get_tcp_point(self): + tcp_pose = self.rtde_r.getActualTCPPose() + tcp_point = tcp_pose[:3] + return tcp_point + +def deg_to_rad(deg): + return deg / 180 * np.pi + + +def robot_init(robot): + j_init = deg_to_rad(np.array([-90,-70,150,-170,-90,-90])) + return robot.rtde_c.moveJ( + j_init.tolist(), 1.0, 1.0, True) + + +def robot_move_joint(robot, joint_id, delta, acc=1.0, speed=1.0): + new_j = np.array(robot.rtde_r.getActualQ()) + new_j[joint_id] += delta + + return robot.rtde_c.moveJ( + new_j.tolist(), acc, speed, True) + 
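+# A minimal sketch of how the calibration produced by this app is consumed downstream,
+# assuming a hypothetical file path: ur5_whip_environment.py loads the pickle and uses only
+# the 'tx_img_robot' homography, which maps points on the robot's swing plane (the (y, z)
+# components of the recorded TCP poses, with x assumed constant) to image pixels.
+#
+#   import pickle
+#   import numpy as np
+#   cal = pickle.load(open('data/calibration.pkl', 'rb'))       # hypothetical path
+#   tx_img_robot = cal['tx_img_robot']                          # 3x3 plane-to-image homography
+#   pix = homo_transform(tx_img_robot, np.array([[0.0, 1.5]]))  # one (y, z) point in metres -> pixel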
+class Camera: + def __init__(self): + self.camera = None + + def __enter__(self): + init_params = sl.InitParameters( + camera_resolution=sl.RESOLUTION.HD720, + camera_fps=60, + depth_mode=sl.DEPTH_MODE.NONE, + sdk_verbose=True, + sdk_gpu_id=0, + enable_image_enhancement=True + ) + zed = sl.Camera() + err = zed.open(init_params) + assert(err == sl.ERROR_CODE.SUCCESS) + + video_setting_dict = { + 'BRIGHTNESS': 4, + 'CONTRAST': 4, + 'HUE': 0, + 'SATURATION': 4, + 'SHARPNESS': 4, + 'GAMMA': 5, + 'AEC_AGC': 1, + 'WHITEBALANCE_AUTO': 1, + 'LED_STATUS': 1 + } + + for key, value in video_setting_dict.items(): + zed.set_camera_settings(getattr(sl.VIDEO_SETTINGS, key), value) + + self.camera = zed + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.camera.close() + + def get_image(self): + mat = sl.Mat() + self.camera.grab() + self.camera.retrieve_image(mat, sl.VIEW.LEFT) + img = mat.get_data() + return img + + +def homo_transform(mat, points): + if mat.shape[1] != points.shape[1]: + points = np.concatenate([points, np.ones((points.shape[0],1))], axis=1) + homo = points @ mat.T + result = (homo[:,:homo.shape[1]-1].T / homo[:,-1]).T + return result + + +class UR5CalibrationApp: + def __init__(self, camera, robot, window_name='left_image'): + self.window_name = window_name + self.current_img_point = None + self.current_robot_point = None + self.data = list() + + self.camera = camera + self.robot = robot + + def start(self): + cv2.namedWindow(self.window_name) + cv2.setMouseCallback(self.window_name, self.mouse_callback) + + def end(self): + cv2.destroyWindow(self.window_name) + + def mouse_callback(self, action, x, y, flags, *userdata): + if action == cv2.EVENT_LBUTTONDOWN: + coord = (x,y) + print('click', coord) + self.current_img_point = coord + + def step(self): + img = self.camera.get_image() + self.current_img = img.copy() + self.draw_calibration(img) + if self.current_img_point is not None: + cv2.drawMarker(img, self.current_img_point, + color=(0,0,255), markerType=cv2.MARKER_CROSS, + markerSize=20, thickness=1) + cv2.imshow(self.window_name, img) + key = cv2.waitKey(17) + return key + + def update_robot_point(self): + self.current_robot_point = self.robot.get_tcp_point() + + def clear_coord(self): + self.current_img_point = None + self.current_robot_point = None + + def save_coord(self): + if self.current_robot_point is None: + print("no robot point") + elif self.current_img_point is None: + print("no clicked point") + else: + self.data.append({ + 'robot_point': self.current_robot_point, + 'img_point': self.current_img_point + }) + + def compute_homography(self): + # assuming x won't change + robot_points_2d = self.get_robot_points()[:,1:] + img_points = self.get_image_points() + # least squares + tx_img_robot, confidence = cv2.findHomography( + robot_points_2d, img_points) + return tx_img_robot + + def get_robot_points(self): + return np.array([x['robot_point'] for x in self.data]) + + def get_image_points(self): + return np.array([ + x['img_point'] for x in self.data], + dtype=np.float64) + + def __len__(self): + return len(self.data) + + def draw_calibration(self, img): + if len(self) >= 4: + tx_img_robot = self.compute_homography() + # draw box + robot_corners = np.array([ + [1,0], + [1,2], + [-1,2], + [-1,0] + ], dtype=np.float64) + img_conrers = homo_transform( + tx_img_robot, robot_corners) + cv2.polylines(img, + [img_conrers.round().astype(np.int32)], + isClosed=True, color=(255,0,0)) + # draw workspace + robot_corners = np.array([ + [-3,-3], + [3,-3], + 
[3,3], + [-3,3] + ], dtype=np.float64) + img_conrers = homo_transform( + tx_img_robot, robot_corners) + cv2.polylines(img, + [img_conrers.round().astype(np.int32)], + isClosed=True, color=(0,0,0)) + + # draw robot points + robot_points_2d = self.get_robot_points()[:,1:] + val_img_points = homo_transform( + tx_img_robot, robot_points_2d) + for point in val_img_points: + cv2.drawMarker(img, point.round().astype(np.int32), + color=(255,0,0), markerType=cv2.MARKER_CROSS, + markerSize=20, thickness=1) + if len(self) > 0: + # draw image points + img_points = self.get_image_points() + for point in img_points: + cv2.drawMarker(img, point.round().astype(np.int32), + color=(0,255,0), markerType=cv2.MARKER_CROSS, + markerSize=20, thickness=1) + + def save_calibration(self, fname): + if len(self) < 4: + print('Calibration requires at least 4 points!') + return + + path = pathlib.Path(fname) + path = path.with_suffix('.pkl') + path.parent.mkdir(parents=True, exist_ok=True) + calib_data = {'points': self.data}  # keep the raw click/TCP correspondences alongside the homography + calib_data['tx_img_robot'] = self.compute_homography() + pickle.dump(calib_data, path.open('wb')) + print(f"Calibration file saved to {path.absolute()}") + + def pop(self): + if len(self.data) > 0: + return self.data.pop() + + +# %% +def main(): + parser = argparse.ArgumentParser( + description='Planar calibration\nWASD control\nI to init robot\nQ to quit\n') + parser.add_argument('-o', '--output', help='calibration file', required=True) + parser.add_argument('--ip', default='192.168.0.139', help='robot ip') + parser.add_argument('--delta', default=40, type=float, help="action delta in degrees") + + args = parser.parse_args() + + + with Robot(robot_ip=args.ip) as robot: + with Camera() as camera: + app = UR5CalibrationApp(camera, robot) + app.start() + while True: + delta = args.delta / 180 * 3.14 + key = app.step() + if key == ord('q'): + print('exit') + app.save_calibration(args.output) + break + elif key == ord('i'): + app.clear_coord() + robot_init(robot) + elif key == ord('e'): + print('delete') + app.clear_coord() + elif key == 13: + print('enter') + app.save_calibration(args.output) + elif key == 8: + print('backspace') + app.pop() + # move + elif key == ord('w'): + app.save_coord() + app.clear_coord() + robot_move_joint(robot, 2, -delta) + elif key == ord('s'): + app.save_coord() + app.clear_coord() + robot_move_joint(robot, 2, delta) + elif key == ord('a'): + app.save_coord() + app.clear_coord() + robot_move_joint(robot, 3, -delta) + elif key == ord('d'): + app.save_coord() + app.clear_coord() + robot_move_joint(robot, 3, delta) + # if key != -1: + # print("key:", key) + app.update_robot_point() + + app.end() + return + +# %% +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/video_labeler.py b/video_labeler.py new file mode 100644 index 0000000..bff8cc2 --- /dev/null +++ b/video_labeler.py @@ -0,0 +1,136 @@ +# %% +import os +import pathlib +import json +import argparse + +import cv2 +import skvideo.io + +# %% +class VideoLabeler: + def __init__(self, video_path, label_dir): + frames = skvideo.io.vread(video_path) + self.frames = frames + self.i = 0 + + # load label state + pathlib.Path(label_dir).mkdir(parents=True, exist_ok=True) + self.label_dir = label_dir + self.label_dict = dict() + if os.path.isfile(self.json_path): + self.label_dict = json.load(open(self.json_path, 'r')) + + self.image_buffer = dict() + + @property + def curr_index(self): + return self.i + + def __len__(self): + return len(self.frames) + + @property + def json_path(self): + return os.path.join(self.label_dir, 
'labels.json') + + def save_labels(self): + pathlib.Path(self.label_dir).mkdir(parents=True, exist_ok=True) + json.dump(self.label_dict, open(self.json_path, 'w'), indent=2) + + def save_images(self): + pathlib.Path(self.label_dir).mkdir(parents=True, exist_ok=True) + # glob + files = pathlib.Path(self.label_dir).glob('*.jpg') + file_path_map = dict() + for file in files: + key = file.stem + path = str(file.absolute()) + file_path_map[key] = path + + # delete unlabeled images + for key, path in file_path_map.items(): + if key not in self.label_dict: + os.remove(path) + + # save unsaved images + for key, img in self.image_buffer.items(): + path = os.path.join(self.label_dir, key + '.jpg') + cv2.imwrite(path, img[:,:,[2,1,0]]) + + self.image_buffer = dict() + + def add_label(self, coord): + key = str(self.curr_index) + self.label_dict[key] = coord + self.image_buffer[key] = self.frames[self.curr_index] + + def delete_label(self): + key = str(self.curr_index) + self.label_dict.pop(key, None) + self.image_buffer.pop(key, None) + + def next_frame(self): + self.i = min(self.i + 1, len(self.frames) - 1) + return self.i + + def prev_frame(self): + self.i = max(self.i - 1, 0) + return self.i + + def get_curr_img(self): + vis_img = self.frames[self.curr_index].copy() + key = str(self.curr_index) + if key in self.label_dict: + coord = self.label_dict[key] + cv2.drawMarker(vis_img, coord, + color=(255,0,0), markerType=cv2.MARKER_CROSS, + markerSize=20, thickness=1) + vis_img = vis_img[:,:,[2,1,0]].copy() + return vis_img + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--input', required=True) + parser.add_argument('-o', '--output', required=True) + args = parser.parse_args() + + state = VideoLabeler(args.input, args.output) + def callback(action, x, y, flags, *userdata): + if action == cv2.EVENT_LBUTTONDOWN: + coord = (x,y) + print(coord) + state.add_label(coord=coord) + + cv2.namedWindow("img", cv2.WINDOW_NORMAL) + cv2.setMouseCallback("img", callback) + + while True: + cv2.imshow("img", state.get_curr_img()) + key = cv2.waitKey(17) + + + if key == ord('q'): + print('exit') + break + elif key == ord('a'): + print('prev') + frame = state.prev_frame() + print(f'{frame}/{len(state)}') + elif key == ord('d'): + print('next') + frame = state.next_frame() + print(f'{frame}/{len(state)}') + elif key == 255: + print('delete') + state.delete_label() + elif key == 13: + print('save') + state.save_labels() + state.save_images() + # if key != -1: + # print("key:", key) + +# %% +if __name__ == '__main__': + main()
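+# Example invocation (hypothetical paths) using the -i/-o arguments defined above:
+#   python video_labeler.py -i data/videos/rope_swing.mp4 -o data/labels/rope_swing
+# Controls: left-click marks the keypoint on the current frame, 'a'/'d' step to the
+# previous/next frame, Enter saves labels.json plus the labeled frames as .jpg files,
+# and 'q' quits.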