diff --git a/.gitignore b/.gitignore
index 337123a38e..e922fda00b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 __pycache__
 *.pyc
 .DS_Store
+.vscode
 *.egg-info
 *.pkl
 *~
diff --git a/predicators/approaches/documentation/grammar_search_invention_approach.md b/predicators/approaches/documentation/grammar_search_invention_approach.md
index 2919dfa218..7d629f0d78 100644
--- a/predicators/approaches/documentation/grammar_search_invention_approach.md
+++ b/predicators/approaches/documentation/grammar_search_invention_approach.md
@@ -4,7 +4,7 @@ This approach is primarily useful for inventing predicates via program synthesis
 
 An example command for running the approach from that paper is:
 ```
-python predicators/main.py --env cover --approach grammar_search_invention --excluded_predicates all --num_train_tasks 50
+python predicators/main.py --env cover --approach grammar_search_invention --excluded_predicates all --num_train_tasks 50 --seed 0
 ```
 
 Last updated: 04/28/2024
@@ -64,9 +64,9 @@ apple_coring__vlm_demos__456__2
 ### Running predicate invention using these image demos
 To use the Gemini VLM, you need to set the `GOOGLE_API_KEY` environment variable in your terminal. You can make/get an API key [here](https://aistudio.google.com/app/apikey).
 
-Example command: `python predicators/main.py --env apple_coring --seed 456 --approach grammar_search_invention --excluded_predicates all --num_train_tasks 1 --num_test_tasks 0 --offline_data_method img_demos --vlm_trajs_folder_name apple_coring__vlm_demos__456__1`
+Example command: `python predicators/main.py --env apple_coring --seed 456 --approach grammar_search_invention --excluded_predicates all --num_train_tasks 1 --num_test_tasks 0 --offline_data_method saved_vlm_img_demos_folder --vlm_trajs_folder_name apple_coring__vlm_demos__456__1`
 
-The important flags here are the `--offline_data_method img_demos` and the `--vlm_trajs_folder_name apple_coring__vlm_demos__456__1`. The latter should point to the folder housing the demonstration set of interest!
+The important flags here are the `--offline_data_method saved_vlm_img_demos_folder` and the `--vlm_trajs_folder_name apple_coring__vlm_demos__456__1`. The latter should point to the folder housing the demonstration set of interest!
 
 Note that VLM responses are always cached, so if you run the command on a demonstration set and then rerun it, it should be much faster since it's using cached responses!
 
diff --git a/predicators/approaches/grammar_search_invention_approach.py b/predicators/approaches/grammar_search_invention_approach.py
index c4fbf5b967..cbcecbc0b0 100644
--- a/predicators/approaches/grammar_search_invention_approach.py
+++ b/predicators/approaches/grammar_search_invention_approach.py
@@ -341,7 +341,8 @@ def _quantified_types(self) -> List[Type]:
         ]
 
     def _classify_object(self, s: State, obj: Object) -> bool:
-        assert obj.type == self.body.types[self.free_variable_idx]
+        # Relaxed from exact type equality to an is_instance() check.
+        assert obj.is_instance(self.body.types[self.free_variable_idx])
         for o in utils.get_object_combinations(set(s), self._quantified_types):
             o_lst = list(o)
             o_lst.insert(self.free_variable_idx, obj)
@@ -995,7 +996,10 @@ def _parse_atom_dataset_from_annotated_dataset(
         return (atom_dataset, candidates)
 
     def learn_from_offline_dataset(self, dataset: Dataset) -> None:
-        if not CFG.offline_data_method == "demo+labelled_atoms":
+        if not CFG.offline_data_method in [
+                "demo+labelled_atoms", "saved_vlm_img_demos_folder",
+                "demo_with_vlm_imgs"
+        ]:
             atom_dataset, candidates = self._generate_atom_dataset_via_grammar(
                 dataset)
         else:
diff --git a/predicators/approaches/nsrt_learning_approach.py b/predicators/approaches/nsrt_learning_approach.py
index 5594a2ff60..d6dcbf06ff 100644
--- a/predicators/approaches/nsrt_learning_approach.py
+++ b/predicators/approaches/nsrt_learning_approach.py
@@ -74,6 +74,30 @@ def _learn_nsrts(self, trajectories: List[LowLevelTrajectory],
             ground_atom_dataset = utils.create_ground_atom_dataset(
                 trajectories, self._get_current_predicates())
             utils.save_ground_atom_dataset(ground_atom_dataset, dataset_fname)
+        elif CFG.offline_data_method in [
+                "demo+labelled_atoms", "saved_vlm_img_demos_folder",
+                "demo_with_vlm_imgs"
+        ]:
+            # In this case, the annotations are basically ground atoms!
+            # We can use these to make GroundAtomTrajectories.
+            assert annotations is not None
+            assert len(annotations) == len(trajectories)
+            ground_atom_dataset = []
+            annotations_with_only_selected_preds = []
+            selected_preds = self._get_current_predicates()
+            for atoms_traj in annotations:
+                curr_selected_preds_atoms_traj = []
+                for atoms_set in atoms_traj:
+                    curr_selected_preds_atoms_set = set(
+                        atom for atom in atoms_set
+                        if atom.predicate in selected_preds)
+                    curr_selected_preds_atoms_traj.append(
+                        curr_selected_preds_atoms_set)
+                annotations_with_only_selected_preds.append(
+                    curr_selected_preds_atoms_traj)
+            for ll_traj, atoms in zip(trajectories,
+                                      annotations_with_only_selected_preds):
+                ground_atom_dataset.append((ll_traj, atoms))
         self._nsrts, self._segmented_trajs, self._seg_to_nsrt = \
             learn_nsrts_from_data(trajectories,
                                   self._train_tasks,
diff --git a/predicators/args.py b/predicators/args.py
index 56a085a454..537c45fffd 100644
--- a/predicators/args.py
+++ b/predicators/args.py
@@ -28,6 +28,7 @@ def create_arg_parser(env_required: bool = True,
     parser.add_argument("--make_failure_videos", action="store_true")
     parser.add_argument("--make_interaction_videos", action="store_true")
     parser.add_argument("--make_demo_videos", action="store_true")
+    parser.add_argument("--make_demo_images", action="store_true")
     parser.add_argument("--make_cogman_videos", action="store_true")
     parser.add_argument("--load_approach", action="store_true")
     # In the case of online learning approaches, load_approach by itself
diff --git a/predicators/cogman.py b/predicators/cogman.py
index 4401f60c98..b0c4848cf7 100644
--- a/predicators/cogman.py
+++ b/predicators/cogman.py
@@ -9,16 +9,20 @@
 The name "CogMan" is due to Leslie Kaelbling.
 """
 import logging
-from typing import Callable, List, Optional, Sequence, Set
+import time
+from collections import defaultdict
+from typing import Callable, List, Optional, Sequence, Set, Tuple
+from typing import Type as TypingType
 
 from predicators import utils
 from predicators.approaches import BaseApproach
+from predicators.envs import BaseEnv
 from predicators.execution_monitoring import BaseExecutionMonitor
 from predicators.perception import BasePerceiver
 from predicators.settings import CFG
 from predicators.structs import Action, Dataset, EnvironmentTask, GroundAtom, \
     InteractionRequest, InteractionResult, LowLevelTrajectory, Metrics, \
-    Observation, State, Task, Video
+    Observation, State, Task, Video, _Option
 
 
 class CogMan:
@@ -177,3 +181,113 @@ def _reset_policy(self, task: Task) -> None:
         else:
             self._current_policy = self._approach.solve(task,
                                                         timeout=CFG.timeout)
+
+
+def run_episode_and_get_observations(
+    cogman: CogMan,
+    env: BaseEnv,
+    train_or_test: str,
+    task_idx: int,
+    max_num_steps: int,
+    do_env_reset: bool = True,
+    terminate_on_goal_reached: bool = True,
+    exceptions_to_break_on: Optional[Set[TypingType[Exception]]] = None,
+    monitor: Optional[utils.LoggingMonitor] = None
+) -> Tuple[Tuple[List[Observation], List[Action]], bool, Metrics]:
+    """Run cogman on a train or test task and record observations/actions.
+
+    Both the environment and cogman's internal state are updated. The
+    episode ends when any of the following holds: (1) cogman.step
+    returns None; (2) max_num_steps actions have been taken; (3) cogman
+    or env raises an exception that is an instance of a class in
+    exceptions_to_break_on (other exceptions are re-raised); (4)
+    terminate_on_goal_reached is True and env.goal_reached() holds. An
+    action is only recorded after env.step succeeds, so the observations
+    list is always one longer than the actions list. Returns
+    ((observations, actions), solved, metrics), where solved is
+    env.goal_reached() at the end and metrics holds "policy_call_time"
+    and "num_options_executed". Ideally, this function would live in
+    utils.py, but that causes import errors, so it is kept here for now.
+    """
+    if do_env_reset:
+        env.reset(train_or_test, task_idx)
+        if monitor is not None:
+            monitor.reset(train_or_test, task_idx)
+    obs = env.get_observation()
+    observations = [obs]
+    actions: List[Action] = []
+    curr_option: Optional[_Option] = None
+    metrics: Metrics = defaultdict(float)
+    metrics["policy_call_time"] = 0.0
+    metrics["num_options_executed"] = 0.0
+    exception_raised_in_step = False
+    if not (terminate_on_goal_reached and env.goal_reached()):
+        for _ in range(max_num_steps):
+            monitor_observed = False
+            exception_raised_in_step = False
+            try:
+                start_time = time.perf_counter()
+                act = cogman.step(obs)
+                metrics["policy_call_time"] += time.perf_counter() - start_time
+                if act is None:
+                    break
+                if act.has_option() and act.get_option() != curr_option:
+                    curr_option = act.get_option()
+                    metrics["num_options_executed"] += 1
+                # Note: it's important to call monitor.observe() before
+                # env.step(), because the monitor may, for example, call
+                # env.render(), which outputs images of the current env
+                # state. If we instead called env.step() first, we would
+                # mistakenly record images of the next time step instead of
+                # the current one.
+                if monitor is not None:
+                    monitor.observe(obs, act)
+                    monitor_observed = True
+                obs = env.step(act)
+                actions.append(act)
+                observations.append(obs)
+            except Exception as e:
+                if exceptions_to_break_on is not None and \
+                   any(issubclass(type(e), c) for c in exceptions_to_break_on):
+                    if monitor_observed:
+                        exception_raised_in_step = True
+                    break
+                if monitor is not None and not monitor_observed:
+                    monitor.observe(obs, None)
+                raise e
+            if terminate_on_goal_reached and env.goal_reached():
+                break
+    if monitor is not None and not exception_raised_in_step:
+        monitor.observe(obs, None)
+    cogman.finish_episode(obs)
+    traj = (observations, actions)
+    solved = env.goal_reached()
+    return traj, solved, metrics
+
+
+def run_episode_and_get_states(
+    cogman: CogMan,
+    env: BaseEnv,
+    train_or_test: str,
+    task_idx: int,
+    max_num_steps: int,
+    do_env_reset: bool = True,
+    terminate_on_goal_reached: bool = True,
+    exceptions_to_break_on: Optional[Set[TypingType[Exception]]] = None,
+    monitor: Optional[utils.LoggingMonitor] = None
+) -> Tuple[LowLevelTrajectory, bool, Metrics]:
+    """Execute cogman starting from the initial state of a train or test task
+    in the environment.
+
+    Return a trajectory involving States (which come from running a
+    perceiver on observations) rather than raw observations; states are
+    useful for downstream learning (e.g. predicates, operators,
+    samplers, etc.). This calls run_episode_and_get_observations with
+    the same arguments, discards its observation trajectory, and returns
+    cogman.get_current_history() plus the same solved flag and metrics.
+    """
+    _, solved, metrics = run_episode_and_get_observations(
+        cogman, env, train_or_test, task_idx, max_num_steps, do_env_reset,
+        terminate_on_goal_reached, exceptions_to_break_on, monitor)
+    ll_traj = cogman.get_current_history()
+    return ll_traj, solved, metrics
diff --git a/predicators/datasets/__init__.py b/predicators/datasets/__init__.py
index 6a5159886d..657d5ee471 100644
--- a/predicators/datasets/__init__.py
+++ b/predicators/datasets/__init__.py
@@ -7,16 +7,18 @@
 from predicators.datasets.demo_only import create_demo_data
 from predicators.datasets.demo_replay import create_demo_replay_data
 from predicators.datasets.generate_atom_trajs_with_vlm import \
-    create_ground_atom_data_from_img_trajs, \
-    create_ground_atom_data_from_labelled_txt
+    create_ground_atom_data_from_generated_demos, \
+    create_ground_atom_data_from_labelled_txt, \
+    create_ground_atom_data_from_saved_img_trajs
 from predicators.datasets.ground_atom_data import create_ground_atom_data
 from predicators.envs import BaseEnv
 from predicators.settings import CFG
-from predicators.structs import Dataset, ParameterizedOption, Task
+from predicators.structs import Dataset, ParameterizedOption, Predicate, Task
 
 
 def create_dataset(env: BaseEnv, train_tasks: List[Task],
-                   known_options: Set[ParameterizedOption]) -> Dataset:
+                   known_options: Set[ParameterizedOption],
+                   known_predicates: Set[Predicate]) -> Dataset:
     """Create offline datasets for training, given a set of training tasks for
     an environment.
 
@@ -43,16 +45,42 @@ def create_dataset(env: BaseEnv, train_tasks: List[Task],
         n = int(CFG.teacher_dataset_num_examples)
         assert n >= 1, "Must have at least 1 example of each predicate"
         return create_ground_atom_data(env, base_dataset, excluded_preds, n)
+    if CFG.offline_data_method == "demo_with_vlm_imgs":  # pragma: no cover  # pylint:disable=line-too-long
+        # NOTE: this below method is tested separately; it's just that testing
+        # it by calling the above function is painful because a VLM is
+        # instantiated and called from inside this method, but when testing,
+        # we want to instantiate our own 'dummy' VLM.
+        # NOTE: this data generation method is currently not compatible with
+        # option learning because it will modify dataset trajectories to
+        # remove a number of intermediate states when an option was being
+        # executed. Thus, we assert this before doing anything further.
+        assert CFG.option_learner == "no_learning", \
+            ("offline data method demo_with_vlm_imgs only compatible with the"
+            " 'no_learning' option learner.")
+        # First, we call create_demo_data to create a dataset.
+        demo_data = create_demo_data(env,
+                                     train_tasks,
+                                     known_options,
+                                     annotate_with_gt_ops=False)
+        assert len(demo_data.trajectories) == len(train_tasks), (
+            "Cannot run "
+            "VLM-based predicate invention if we don't have one demo per "
+            "training task; ensure there are no failures in demonstration "
+            "generation.")
+        # Second, we add annotations to these trajectories by leveraging
+        # a VLM.
+        return create_ground_atom_data_from_generated_demos(
+            demo_data, env, known_predicates, train_tasks)
     if CFG.offline_data_method == "demo+labelled_atoms":
         return create_ground_atom_data_from_labelled_txt(
             env, train_tasks, known_options)
-    if CFG.offline_data_method == "img_demos":  # pragma: no cover.
+    if CFG.offline_data_method == "saved_vlm_img_demos_folder":  # pragma: no cover  # pylint:disable=line-too-long
         # NOTE: this below method is tested separately; it's just that testing
         # it by calling the above function is painful because a VLM is
         # instantiated and called from inside this method, but when testing,
         # we want to instantiate our own 'dummy' VLM.
-        return create_ground_atom_data_from_img_trajs(env, train_tasks,
-                                                      known_options)
+        return create_ground_atom_data_from_saved_img_trajs(
+            env, train_tasks, known_predicates, known_options)
     if CFG.offline_data_method == "empty":
         return Dataset([])
     raise NotImplementedError("Unrecognized dataset method.")
diff --git a/predicators/datasets/demo_only.py b/predicators/datasets/demo_only.py
index 397c39c1e2..dfa3bfc425 100644
--- a/predicators/datasets/demo_only.py
+++ b/predicators/datasets/demo_only.py
@@ -9,13 +9,15 @@
 import dill as pkl
 import matplotlib
 import matplotlib.pyplot as plt
-import numpy as np
 
 from predicators import utils
 from predicators.approaches import ApproachFailure, ApproachTimeout
 from predicators.approaches.oracle_approach import OracleApproach
+from predicators.cogman import CogMan, run_episode_and_get_states
 from predicators.envs import BaseEnv
+from predicators.execution_monitoring import create_execution_monitor
 from predicators.ground_truth_models import get_gt_options
+from predicators.perception import create_perceiver
 from predicators.settings import CFG
 from predicators.structs import Action, Dataset, LowLevelTrajectory, \
     ParameterizedOption, State, Task
@@ -132,6 +134,9 @@ def _generate_demonstrations(env: BaseEnv, train_tasks: List[Task],
     """Use the demonstrator to generate demonstrations, one per training task
     starting from train_tasks_start_idx."""
     if CFG.demonstrator == "oracle":
+        # Instantiate CogMan with the oracle approach (to be used as the
+        # demonstrator). This requires creating a perceiver and
+        # execution monitor according to settings from CFG.
         options = get_gt_options(env.get_name())
         oracle_approach = OracleApproach(
             env.predicates,
@@ -142,6 +147,9 @@ def _generate_demonstrations(env: BaseEnv, train_tasks: List[Task],
             task_planning_heuristic=CFG.offline_data_task_planning_heuristic,
             max_skeletons_optimized=CFG.offline_data_max_skeletons_optimized,
             bilevel_plan_without_sim=CFG.offline_data_bilevel_plan_without_sim)
+        perceiver = create_perceiver(CFG.perceiver)
+        execution_monitor = create_execution_monitor(CFG.execution_monitor)
+        cogman = CogMan(oracle_approach, perceiver, execution_monitor)
     else:  # pragma: no cover
         # Disable all built-in keyboard shortcuts.
         keymaps = {k for k in plt.rcParams if k.startswith("keymap.")}
@@ -154,14 +162,14 @@ def _generate_demonstrations(env: BaseEnv, train_tasks: List[Task],
     if annotate_with_gt_ops:
         annotations = []
     num_tasks = min(len(train_tasks), CFG.max_initial_demos)
-    rng = np.random.default_rng(CFG.seed)
-    if CFG.offline_data_bilevel_plan_without_sim is None:
-        bilevel_plan_without_sim = CFG.bilevel_plan_without_sim
-    else:
-        bilevel_plan_without_sim = CFG.offline_data_bilevel_plan_without_sim
     for idx, task in enumerate(train_tasks):
         if idx < train_tasks_start_idx:  # ignore demos before this index
             continue
+        if CFG.make_demo_videos or CFG.make_demo_images:
+            video_monitor = utils.VideoMonitor(env.render)
+        else:
+            video_monitor = None
+
         # Note: we assume in main.py that demonstrations are only generated
         # for train tasks whose index is less than CFG.max_initial_demos. If
         # you modify code around here, make sure that this invariant holds.
@@ -169,48 +177,41 @@ def _generate_demonstrations(env: BaseEnv, train_tasks: List[Task],
             break
         try:
             if CFG.demonstrator == "oracle":
-                timeout = CFG.offline_data_planning_timeout
-                if timeout == -1:
-                    timeout = CFG.timeout
-                oracle_approach.solve(task, timeout=timeout)
-                # Since we're running the oracle approach, we know that
-                # the policy is actually a plan under the hood, and we
-                # can retrieve it with get_last_plan(). We do this
-                # because we want to run the full plan.
-                if bilevel_plan_without_sim:
-                    last_nsrt_plan = oracle_approach.get_last_nsrt_plan()
-                    policy = utils.nsrt_plan_to_greedy_policy(
-                        last_nsrt_plan, task.goal, rng)
-                else:
-                    last_plan = oracle_approach.get_last_plan()
-                    policy = utils.option_plan_to_policy(last_plan)
-                # We will stop run_policy() when OptionExecutionFailure()
-                # is hit, which should only happen when the goal has been
-                # reached, as verified by the assertion later.
-                termination_function = lambda s: False
+                # In this case, we use the instantiated cogman to generate
+                # demonstrations. Importantly, we want to access state-action
+                # trajectories, not observation-action ones.
+                env_task = env.get_train_tasks()[idx]
+                cogman.reset(env_task)
+                traj, _, _ = run_episode_and_get_states(
+                    cogman,
+                    env,
+                    "train",
+                    idx,
+                    max_num_steps=CFG.horizon,
+                    exceptions_to_break_on={
+                        utils.OptionExecutionFailure,
+                        utils.HumanDemonstrationFailure,
+                    },
+                    monitor=video_monitor)
             else:  # pragma: no cover
+                # Otherwise, we get human input demos.
                 caption = (f"Task {idx+1} / {num_tasks}\nPlease demonstrate "
                            f"achieving the goal:\n{task.goal}")
                 policy = functools.partial(human_demonstrator_policy, env,
                                            caption, event_to_action)
                 termination_function = task.goal_holds
-
-            if CFG.make_demo_videos:
-                monitor = utils.VideoMonitor(env.render)
-            else:
-                monitor = None
-            traj, _ = utils.run_policy(
-                policy,
-                env,
-                "train",
-                idx,
-                termination_function=termination_function,
-                max_num_steps=CFG.horizon,
-                exceptions_to_break_on={
-                    utils.OptionExecutionFailure,
-                    utils.HumanDemonstrationFailure,
-                },
-                monitor=monitor)
+                traj, _ = utils.run_policy(
+                    policy,
+                    env,
+                    "train",
+                    idx,
+                    termination_function=termination_function,
+                    max_num_steps=CFG.horizon,
+                    exceptions_to_break_on={
+                        utils.OptionExecutionFailure,
+                        utils.HumanDemonstrationFailure,
+                    },
+                    monitor=video_monitor)
         except (ApproachTimeout, ApproachFailure,
                 utils.EnvironmentFailure) as e:
             logging.warning("WARNING: Approach failed to solve with error: "
@@ -245,10 +246,17 @@ def _generate_demonstrations(env: BaseEnv, train_tasks: List[Task],
             last_nsrt_plan = oracle_approach.get_last_nsrt_plan()
             annotations.append(list(last_nsrt_plan))
         if CFG.make_demo_videos:
-            assert monitor is not None
-            video = monitor.get_video()
+            assert video_monitor is not None
+            video = video_monitor.get_video()
             outfile = f"{CFG.env}__{CFG.seed}__demo__task{idx}.mp4"
             utils.save_video(outfile, video)
+        if CFG.make_demo_images:
+            assert video_monitor is not None
+            video = video_monitor.get_video()
+            width = len(str(len(train_tasks)))
+            task_number = str(idx).zfill(width)
+            outfile_prefix = f"{CFG.env}__{CFG.seed}__demo__task{task_number}"
+            utils.save_images(outfile_prefix, video)
     if annotate_with_gt_ops:
         dataset = Dataset(trajectories, annotations)
     else:
diff --git a/predicators/datasets/generate_atom_trajs_with_vlm.py b/predicators/datasets/generate_atom_trajs_with_vlm.py
index 5831696357..aeae8eeedc 100644
--- a/predicators/datasets/generate_atom_trajs_with_vlm.py
+++ b/predicators/datasets/generate_atom_trajs_with_vlm.py
@@ -5,17 +5,19 @@
 import logging
 import os
 import re
+from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Set, Tuple
 
+import dill as pkl
 import numpy as np
 import PIL.Image
 
 from predicators import utils
 from predicators.envs import BaseEnv
-from predicators.envs.vlm_envs import DUMMY_GOAL_OBJ_NAME, VLMPredicateEnv
-from predicators.pretrained_model_interface import GoogleGeminiVLM, \
-    VisionLanguageModel
+from predicators.envs.vlm_envs import DUMMY_GOAL_OBJ_NAME
+from predicators.nsrt_learning.segmentation import _segment_with_option_changes
+from predicators.pretrained_model_interface import VisionLanguageModel
 from predicators.settings import CFG
 from predicators.structs import Action, Dataset, GroundAtom, \
     ImageOptionTrajectory, LowLevelTrajectory, Object, ParameterizedOption, \
@@ -63,7 +65,8 @@ def _generate_prompt_for_atom_proposals(
             (prompt, [traj.imgs[i][0] for i in range(len(traj.imgs))]))
     elif CFG.grammar_search_vlm_atom_proposal_prompt_type == \
         "options_labels_whole_traj":
-        prompt += "\n".join(act.name + str(sorted(act.objects))
+        prompt += "\nSkills executed in trajectory:\n"
+        prompt += "\n".join(act.name + str(act.objects)
                             for act in traj.actions)
         # NOTE: exact same issue as described in the above note for
         # naive_whole_traj.
@@ -180,7 +183,7 @@ def _parse_unique_atom_proposals_from_list(
         assert len(atoms_proposal_for_traj) == 1
         curr_atoms_proposal = atoms_proposal_for_traj[0]
         # Regex pattern to match predicates
-        atom_match_pattern = r"\b[a-z_]+\([a-z0-9, ]+\)"
+        atom_match_pattern = r"\b[a-z0-9_-]+\([a-z0-9_, -]+\)"
         # Find all matches in the text
         matches = re.findall(atom_match_pattern,
                              curr_atoms_proposal,
@@ -188,7 +191,7 @@ def _parse_unique_atom_proposals_from_list(
         for atom_proposal_txt in matches:
             num_atoms_considered += 1
             atom_is_valid = True
-            atom = re.sub(r"[^\w\s\(\),]", "", atom_proposal_txt).strip(' ')
+            atom = re.sub(r"[^\w\s\(\),_-]", "", atom_proposal_txt).strip(' ')
             obj_names = re.findall(r'\((.*?)\)', atom)
             if obj_names:
                 obj_names_list = [
@@ -206,7 +209,7 @@ def _parse_unique_atom_proposals_from_list(
     return atoms_strs_set
 
 
-def save_labelled_trajs_as_txt(
+def _save_labelled_trajs_as_txt(
         env: BaseEnv, labelled_atoms_trajs: List[List[str]],
         ground_option_trajs: List[List[_Option]]) -> None:
     """Save a txt file with a text representation of GroundAtomTrajectories.
@@ -230,8 +233,12 @@ def save_labelled_trajs_as_txt(
             curr_option_str = curr_option.name + "("
             for obj in curr_option.objects:
                 curr_option_str += str(obj.name) + ", "
-            curr_option_str = curr_option_str[:-2] + ")" + str(
-                curr_option.params.tolist()) + " -> "
+            if len(curr_option.objects) > 0:
+                curr_option_str = curr_option_str[:-2] + ")" + str(
+                    curr_option.params.tolist()) + " -> "
+            else:
+                curr_option_str = curr_option_str[:] + ")" + str(
+                    curr_option.params.tolist()) + " -> "
             save_str += curr_state_str + "\n\n" + curr_option_str + "\n\n"
         # At the end of the trajectory, we need to append the final state,
         # and a "===" delimiter.
@@ -247,10 +254,47 @@ def save_labelled_trajs_as_txt(
     logging.info(f"Human-readable labelled trajectory saved to {filepath}!")
 
 
+def _save_img_option_trajs_in_folder(
+        img_option_trajs: List[ImageOptionTrajectory]) -> None:
+    """Save image option trajectories in a new folder; no-op if it exists."""
+    data_dir_path = os.path.join(utils.get_path_to_predicators_root(),
+                                 CFG.data_dir)
+    base_folder_name = CFG.env + "__vlm_demos__" + str(CFG.seed) + "__" + str(
+        len(img_option_trajs))
+    base_folder_path = Path(data_dir_path, base_folder_name)
+    if not os.path.exists(base_folder_path):
+        os.makedirs(base_folder_path, exist_ok=False)
+        for i, img_option_traj in enumerate(img_option_trajs):
+            curr_traj_folder = Path(base_folder_path, "traj_" + str(i))
+            os.makedirs(curr_traj_folder, exist_ok=False)
+            for j, img_list in enumerate(img_option_traj.imgs):
+                curr_traj_timestep_folder = Path(curr_traj_folder, str(j))
+                os.makedirs(curr_traj_timestep_folder, exist_ok=False)
+                for k, img in enumerate(img_list):
+                    img.save(
+                        Path(curr_traj_timestep_folder,
+                             str(j) + "_" + str(k) + ".jpg"))
+                # Pickle this timestep's state next to its images.
+                assert img_option_traj.states is not None
+                state_file = curr_traj_timestep_folder / "state.p"
+                with open(state_file, "wb") as f:
+                    pkl.dump(img_option_traj.states[j], f)
+            options_txt_file_path = Path(curr_traj_folder, "options_traj.txt")
+            options_txt_file_contents = ""
+            for opt in img_option_traj.actions:
+                options_txt_file_contents += opt.name + "(" + ", ".join(
+                    obj.name for obj in opt.objects) + ", [" + ", ".join(
+                        str(param) for param in opt.params.tolist()) + "])\n"
+            with open(options_txt_file_path, "w", encoding="utf-8") as f:
+                f.write(options_txt_file_contents)
+
+
 def _parse_structured_state_into_ground_atoms(
     env: BaseEnv,
     train_tasks: List[Task],
     structured_state_trajs: List[List[Dict[str, Dict[Tuple[str, ...], bool]]]],
+    state_trajs: Optional[List[List[State]]] = None,
+    known_predicates: Optional[Set[Predicate]] = None,
 ) -> List[List[Set[GroundAtom]]]:
     """Convert structured state trajectories into actual trajectories of ground
     atoms."""
@@ -258,24 +302,31 @@ def _parse_structured_state_into_ground_atoms(
     # Firstly, the number of train tasks must equal the number of structured
     # state demos we have.
     assert len(train_tasks) == len(structured_state_trajs)
-    # Secondly, we assume there is only one goal predicate, and that it is
-    # a dummy goal predicate.
-    assert len(env.goal_predicates) == 1
-    goal_preds_list = list(env.goal_predicates)
-    goal_predicate = goal_preds_list[0]
-    assert goal_predicate.name == "DummyGoal"
-    # We also assume that there is precisely one "object" type that is
+    # Next, we check whether the first train task's goal is a 'DummyGoal'. If
+    # so, we must be using an environment that hasn't been fully implemented
+    # (with proper goal predicates, options, etc.).
+    use_dummy_goal = False
+    if "DummyGoal" in str(train_tasks[0].goal):
+        # In this case, we assume there is only one goal predicate, and that it
+        # is a dummy goal predicate.
+        assert len(env.goal_predicates) == 1
+        goal_preds_list = list(env.goal_predicates)
+        goal_predicate = goal_preds_list[0]
+        assert goal_predicate.name == "DummyGoal"
+        use_dummy_goal = True
+
+    # We also check whether there is precisely one "object" type that is
     # a superset of all other object types.
     obj_type = None
     for t in env.types:
         obj_type = t.oldest_ancestor
-        assert obj_type.name == "object"
-    assert obj_type is not None
+        if obj_type.name != "object":
+            obj_type = None
+            break
 
-    def _stripped_classifier(
-            state: State,
-            objects: Sequence[Object]) -> bool:  # pragma: no cover.
-        raise Exception("Stripped classifier should never be called!")
+    def _get_vlm_query_str(pred_name: str, objects: Sequence[Object]) -> str:
+        return pred_name + "(" + ", ".join(
+            str(obj.name) for obj in objects) + ")"  # pragma: no cover
 
     pred_name_to_pred = {}
     atoms_trajs = []
@@ -285,14 +336,30 @@ def _stripped_classifier(
         curr_atoms_traj = []
         objs_for_task = set(train_tasks[i].init)
         curr_obj_name_to_obj = {obj.name: obj for obj in objs_for_task}
-        # NOTE: We assume that there is precisely one dummy object that is
-        # used to track whether the dummy goal has been reached or not.
-        assert DUMMY_GOAL_OBJ_NAME in curr_obj_name_to_obj
-        # Create a goal atom for this demonstration using the goal predicate.
-        goal_atom = GroundAtom(goal_predicate,
-                               [curr_obj_name_to_obj[DUMMY_GOAL_OBJ_NAME]])
-        for structured_state in traj:
+        # If we have states, then we can just evaluate the goal predicates on
+        # them. But if we don't, then there's nothing we can do except assume
+        # that there is only one goal atom that gets satisfied at the end.
+        assume_goal_holds_at_end = use_dummy_goal
+        if state_trajs is None:
+            assert len(train_tasks[i].goal) == 1
+            assert known_predicates is None or \
+                known_predicates.issubset(env.goal_predicates)
+            assume_goal_holds_at_end = True
+
+        if use_dummy_goal:
+            # NOTE: In this case, we assume that there is precisely one dummy
+            # object that is used to track whether the dummy goal has been
+            # reached or not.
+            assert DUMMY_GOAL_OBJ_NAME in curr_obj_name_to_obj
+
+        for j, structured_state in enumerate(traj):
+
             curr_ground_atoms_state = set()
+            if state_trajs is not None:
+                assert known_predicates is not None
+                curr_ground_atoms_state |= utils.abstract(
+                    state_trajs[i][j], known_predicates)
+
             for pred_name, objs_and_val_dict in structured_state.items():
                 # IMPORTANT NOTE: this currently assumes that the data is such
                 # that a predicate with a certain name (e.g. "Sliced")
@@ -302,6 +369,11 @@ def _stripped_classifier(
                 # check for this in the future.
                 if pred_name not in pred_name_to_pred:
                     if len(objs_and_val_dict.keys()) == 1:
+                        # NOTE: the code below doesn't do the right thing
+                        # when multiple atoms of this predicate are true
+                        # with different objects of the same type
+                        # (e.g. Covers(obj1, targ1) and Covers(obj2, targ2)).
+                        # We might want to do something about this.
                         # In this case, we make a predicate that takes in
                         # exactly one types argument.
                         for obj_args in objs_and_val_dict.keys():
@@ -311,8 +383,10 @@ def _stripped_classifier(
                             for obj_name in obj_args:
                                 curr_obj = curr_obj_name_to_obj[obj_name]
                                 pred_types.append(curr_obj.type)
-                            pred_name_to_pred[pred_name] = Predicate(
-                                pred_name, pred_types, _stripped_classifier)
+                            pred_name_to_pred[
+                                pred_name] = utils.create_vlm_predicate(
+                                    pred_name, pred_types,
+                                    partial(_get_vlm_query_str, pred_name))
                     else:
                         # In this case, we need to make a predicate that
                         # takes in the generic 'object' type such that
@@ -328,9 +402,15 @@ def _stripped_classifier(
                                 assert num_args == len(obj_args)
                         # Given this, add one new predicate with num_args
                         # number of 'object' type arguments.
-                        pred_name_to_pred[pred_name] = Predicate(
-                            pred_name, [obj_type for _ in range(num_args)],
-                            _stripped_classifier)
+                        assert obj_type is not None, (
+                            "VLM atom parsing "
+                            "failure; please add an 'object' type to your "
+                            "environment that is a supertype of all other "
+                            "types.")
+                        pred_name_to_pred[
+                            pred_name] = utils.create_vlm_predicate(
+                                pred_name, [obj_type for _ in range(num_args)],
+                                partial(_get_vlm_query_str, pred_name))
 
                 # Given that we've now built up predicates and object
                 # dictionaries. We can now convert the current state into
@@ -342,8 +422,8 @@ def _stripped_classifier(
                                 pred_name_to_pred[pred_name],
                                 [curr_obj_name_to_obj[o] for o in obj_args]))
             curr_atoms_traj.append(curr_ground_atoms_state)
-        # Add the goal atom at the end of the trajectory.
-        curr_atoms_traj[-1].add(goal_atom)
+        if assume_goal_holds_at_end:
+            curr_atoms_traj[-1] |= train_tasks[i].goal
         atoms_trajs.append(curr_atoms_traj)
     return atoms_trajs
 
@@ -369,6 +449,7 @@ def _parse_structured_actions_into_ground_options(
             ground_option = option.ground([
                 curr_obj_name_to_obj[obj_name]
                 for obj_name in structured_action[1]
+                if len(obj_name.strip()) > 0
             ], np.array(structured_action[2]))
             # Call initiable here because we will be calling
             # terminal later, and initiable always needs
@@ -455,7 +536,7 @@ def _parse_options_txt_into_structured_actions(
     structured set of tuples suitable for later conversion into more structured
     GroundAtomTrajectories."""
     structured_actions_output = []
-    pattern_option = r'(\w+)\(([^)]*)\)\[([\d.,\s]*)\] ->'
+    pattern_option = r'(\w+)\(([^)]*)\)\[([\d.,\s-]*)\] ->'
     option_matches = re.findall(pattern_option, text)
     for i in range(len(option_matches)):
         current_option_with_objs = (option_matches[i][0],
@@ -548,6 +629,144 @@ def _parse_vlmtraj_file_into_structured_trajs(
     return (output_state_trajs, output_action_trajs)
 
 
+def _query_vlm_to_generate_ground_atoms_trajs(
+        image_option_trajs: List[ImageOptionTrajectory], env: BaseEnv,
+        train_tasks: List[Task], known_predicates: Set[Predicate],
+        all_task_objs: Set[Object],
+        vlm: VisionLanguageModel) -> List[List[Set[GroundAtom]]]:
+    """Given a collection of ImageOptionTrajectories, query a VLM to convert
+    these into ground atom trajectories."""
+    if not CFG.grammar_search_vlm_atom_proposal_use_debug:
+        logging.info("Querying VLM for candidate atom proposals...")
+        atom_strs_proposals_list = _sample_vlm_atom_proposals_from_trajectories(
+            image_option_trajs, vlm, 1)
+        logging.info("Done querying VLM for candidate atoms!")
+        # We now parse and sanitize this set of atoms.
+        atom_proposals_set = _parse_unique_atom_proposals_from_list(
+            atom_strs_proposals_list, all_task_objs)
+    else:  # pragma: no cover.
+        atom_proposals_set = env.get_vlm_debug_atom_strs(train_tasks)
+    assert len(atom_proposals_set) > 0, "Atom proposals set is empty!"
+    # Given this set of unique atom proposals, we now ask the VLM
+    # to label these in every scene from the demonstrations.
+    # NOTE: we convert to a sorted list here to get rid of randomness from set
+    # ordering.
+    unique_atoms_list = sorted(atom_proposals_set)
+    # Now, query the VLM!
+    logging.info("Querying VLM to label every scene...")
+    atom_labels = _label_trajectories_with_vlm_atom_values(
+        image_option_trajs, vlm, unique_atoms_list)
+    logging.info("Done querying VLM for scene labelling!")
+    # Save the output as a human-readable txt file.
+    _save_labelled_trajs_as_txt(
+        env, atom_labels, [io_traj.actions for io_traj in image_option_trajs])
+    # Now, parse this information into a Dataset!
+    # Start by converting all the labelled atoms into a more structured
+    # dict. This requires each set of labelled atoms text to be enclosed
+    # by curly brackets.
+    structured_state_trajs = []
+    state_trajs: Optional[List[List[State]]] = []
+    for atom_traj, io_traj in zip(atom_labels, image_option_trajs,
+                                  strict=True):
+        atoms_txt_strs = [
+            '{' + curr_ts_atoms_txt + '}' for curr_ts_atoms_txt in atom_traj
+        ]
+        full_traj_atoms_str = '\n\n'.join(atoms_txt_strs)
+        structured_state_trajs.append(
+            _parse_atoms_txt_into_structured_state(full_traj_atoms_str))
+        if io_traj.states:
+            assert state_trajs is not None
+            state_trajs.append(io_traj.states)
+        else:
+            state_trajs = None
+    # Given this, we now convert each trajectory consisting of a series of
+    # structured states into a trajectory of GroundAtoms.
+    ground_atoms_trajs = _parse_structured_state_into_ground_atoms(
+        env,
+        train_tasks,
+        structured_state_trajs,
+        state_trajs=state_trajs,
+        known_predicates=known_predicates)
+    _debug_log_atoms_trajs(ground_atoms_trajs)
+    return ground_atoms_trajs
+
+
+def create_ground_atom_data_from_generated_demos(
+        dataset: Dataset,
+        env: BaseEnv,
+        known_predicates: Set[Predicate],
+        train_tasks: List[Task],
+        vlm: Optional[VisionLanguageModel] = None) -> Dataset:
+    """Given an input dataset that's been generated from one of our
+    environments, run the VLM on the images associated with the dataset. Return
+    a potentially *smaller* dataset that is annotated with VLM atom values.
+
+    Why are we returning a potentially-smaller dataset here? For many of
+    our environments, option executions result in multiple actions (not
+    just one). However, we only want to call the VLM to label the
+    state(s) before and after option execution. Thus, we first segment
+    the dataset.trajectories via option. We pick out the states before
+    and after each option execution to call the VLM on. We also replace
+    the trajectory actions with dummy zero actions that have the right
+    option. Thus, the dataset returned by this function is not suitable
+    for option learning (it's really intended for a setting where
+    options are known).
+    """
+    # We start by converting trajectories into ImageOptionTrajectories
+    # that we can feed to a VLM.
+    img_option_trajs: List[ImageOptionTrajectory] = []
+    all_task_objs: Set[Object] = set()
+    option_segmented_trajs: List[LowLevelTrajectory] = []
+    for traj in dataset.trajectories:
+        # We segment using option changes, which implicitly assumes that
+        # each action in the trajectory is linked to an option that isn't
+        # None.
+        segments = _segment_with_option_changes(traj, set(), None)
+        curr_traj_states_for_vlm: List[State] = []
+        curr_traj_actions_for_vlm: List[Action] = []
+        for segment in segments:
+            curr_traj_states_for_vlm.append(segment.states[0])
+            curr_traj_actions_for_vlm.append(segment.actions[0])
+        # The loop above added the initial state of every option (including
+        # the final one); manually add the terminal state of the final option.
+        curr_traj_states_for_vlm.append(traj.states[-1])
+        # Pull out the images within the states we've saved for the trajectory.
+        # We assume that images are saved in the state's simulator_state
+        # field.
+        state_imgs: List[List[PIL.Image.Image]] = []
+        for state in curr_traj_states_for_vlm:
+            assert isinstance(state.simulator_state, List)
+            assert len(state.simulator_state) > 0
+            state_imgs.append([
+                PIL.Image.fromarray(img_arr)  # type: ignore
+                for img_arr in state.simulator_state
+            ])
+        img_option_trajs.append(
+            ImageOptionTrajectory(
+                set(traj.states[0]), state_imgs,
+                [act.get_option() for act in curr_traj_actions_for_vlm],
+                curr_traj_states_for_vlm, True, traj.train_task_idx))
+        option_segmented_trajs.append(
+            LowLevelTrajectory(curr_traj_states_for_vlm, [
+                Action(np.zeros(act.arr.shape, dtype=float), act.get_option())
+                for act in curr_traj_actions_for_vlm
+            ], True, traj.train_task_idx))
+        all_task_objs |= set(traj.states[0])
+        # We assume that all the input trajectories are such that the train task
+        # goal is achieved. Verify this before proceeding.
+        assert train_tasks[traj.train_task_idx].goal_holds(traj.states[-1])
+    # Save the trajectories in a folder so they can be loaded and re-labelled
+    # later.
+    _save_img_option_trajs_in_folder(img_option_trajs)
+    # Now, given these trajectories, we can query the VLM.
+    if vlm is None:
+        vlm = utils.create_vlm_by_name(CFG.vlm_model_name)  # pragma: no cover
+    ground_atoms_trajs = _query_vlm_to_generate_ground_atoms_trajs(
+        img_option_trajs, env, train_tasks, known_predicates, all_task_objs,
+        vlm)
+    return Dataset(option_segmented_trajs, ground_atoms_trajs)
+
+
 def create_ground_atom_data_from_labelled_txt(
         env: BaseEnv, train_tasks: List[Task],
         known_options: Set[ParameterizedOption]) -> Dataset:
@@ -566,18 +785,42 @@ def create_ground_atom_data_from_labelled_txt(
     _debug_log_atoms_trajs(ground_atoms_trajs)
     option_trajs = _parse_structured_actions_into_ground_options(
         structured_actions, known_options, train_tasks)
-    # We also need to create the goal state for every train task.
-    goal_states_for_every_traj = _create_dummy_goal_state_for_each_task(
-        env, train_tasks)
-    # Finally, we need to construct actual LowLevelTrajectories.
-    low_level_trajs = _convert_ground_option_trajs_into_lowleveltrajs(
-        option_trajs, goal_states_for_every_traj, train_tasks)
+    # Finally, we just need to construct LowLevelTrajectories that we can
+    # output as part of our Dataset.
+    goal_states_for_every_traj = None
+    if "DummyGoal" in str(train_tasks[0].goal):
+        # First, we need to create a goal state for every train task
+        # where the dummy goal predicate holds. This is just bookkeeping
+        # necessary for NSRT learning and planning such that the goal
+        # doesn't hold in the initial state and holds in the final state of
+        # each demonstration trajectory.
+        goal_states_for_every_traj = _create_dummy_goal_state_for_each_task(
+            env, train_tasks)
+        # Finally, we need to construct actual LowLevelTrajectories.
+        low_level_trajs = _convert_ground_option_trajs_into_lowleveltrajs(
+            option_trajs, goal_states_for_every_traj, train_tasks)
+    else:
+        low_level_trajs = []
+        for i, opt_traj in enumerate(option_trajs):
+            states = [train_tasks[i].init for _ in range(len(opt_traj))]
+            states.append(train_tasks[i].init)
+            actions = [
+                Action(np.zeros(env.action_space.shape, dtype=np.float32), opt)
+                for opt in opt_traj
+            ]
+            # NOTE: we're making an assumption here that the train_task_idx
+            # of the LowLevelTrajectory is i. This may not be true and
+            # is probably something we should deal with in a principled way
+            # in the future.
+            low_level_trajs.append(LowLevelTrajectory(states, actions, True,
+                                                      i))
     return Dataset(low_level_trajs, ground_atoms_trajs)
 
 
-def create_ground_atom_data_from_img_trajs(
+def create_ground_atom_data_from_saved_img_trajs(
         env: BaseEnv,
         train_tasks: List[Task],
+        known_predicates: Set[Predicate],
         known_options: Set[ParameterizedOption],
         vlm: Optional[VisionLanguageModel] = None) -> Dataset:
     """Given a folder containing trajectories that have images of scenes for
@@ -593,17 +836,23 @@ def create_ground_atom_data_from_img_trajs(
     assert folder_name_components[1] == "vlm_demos"
     assert int(folder_name_components[2]) == CFG.seed
     assert int(folder_name_components[3]) == CFG.num_train_tasks
-    num_trajs = len(os.listdir(trajectories_folder_path))
+    unfiltered_files = os.listdir(trajectories_folder_path)
+    # Each demonstration trajectory is in subfolder traj_<demo_number>.
+    traj_folders = [f for f in unfiltered_files if f[0:5] == "traj_"]
+    num_trajs = len(traj_folders)
     assert num_trajs == CFG.num_train_tasks
     option_name_to_option = {opt.name: opt for opt in known_options}
     image_option_trajs = []
     all_task_objs = set()
-    for train_task_idx, path in enumerate(
-            sorted(Path(trajectories_folder_path).iterdir())):
+    unfiltered_paths = sorted(Path(trajectories_folder_path).iterdir())
+    # Each demonstration trajectory is in subfolder traj_<demo_number>.
+    filtered_paths = [f for f in unfiltered_paths if "traj_" in f.parts[-1]]
+    for train_task_idx, path in enumerate(filtered_paths):
         assert path.is_dir()
         state_folders = [f.path for f in os.scandir(path) if f.is_dir()]
         num_states_in_traj = len(state_folders)
-        state_traj = []
+        img_traj = []
+        state_traj: Optional[List[State]] = []
         for state_num in range(num_states_in_traj):
             curr_imgs = []
             curr_state_path = path.joinpath(str(state_num))
@@ -611,7 +860,15 @@ def create_ground_atom_data_from_img_trajs(
             img_files = sorted(glob.glob(str(curr_state_path) + "/*.jpg"))
             for img in img_files:
                 curr_imgs.append(PIL.Image.open(img))
-            state_traj.append(curr_imgs)
+            img_traj.append(curr_imgs)
+            state_file = curr_state_path / "state.p"
+            if state_file.exists():  # pragma: no cover
+                with open(state_file, "rb") as fp:
+                    state = pkl.load(fp)
+                assert state_traj is not None
+                state_traj.append(state)
+            else:
+                state_traj = None
         # Get objects from train tasks to be used for future parsing.
         curr_train_task = train_tasks[train_task_idx]
         curr_task_objs = set(curr_train_task.init)
@@ -624,11 +881,24 @@ def create_ground_atom_data_from_img_trajs(
         with open(options_traj_file, "r", encoding="utf-8") as f:
             options_file_str = f.read()
         option_names_list = re.findall(r'(\w+)\(', options_file_str)
-        parsed_str_objects = re.findall(r'\((.*?)\)', options_file_str)
-        object_args_list = [obj.split(', ') for obj in parsed_str_objects]
-        # Remove empty square brackets from the object_args_list.
-        for object_arg_sublist in object_args_list:
-            object_arg_sublist.remove('[]')
+        option_args_strs = re.findall(r'\((.*?)\)', options_file_str)
+        parsed_str_objects = [
+            re.sub(r'\[[^\]]*\]', '', option_args_str).strip()
+            for option_args_str in option_args_strs
+        ]
+        objects_exist = len(''.join(obj_str
+                                    for obj_str in parsed_str_objects)) > 0
+        object_args_list: List[List[str]] = [
+            [] for _ in range(len(parsed_str_objects))
+        ]
+        if objects_exist:
+            cleaned_parsed_str_objects = [
+                obj_str[:-1] if obj_str[-1] == "," else obj_str
+                for obj_str in parsed_str_objects
+            ]
+            object_args_list = [
+                obj.split(', ') for obj in cleaned_parsed_str_objects
+            ]
         parameters = [
             ast.literal_eval(obj) if obj else []
             for obj in re.findall(r'\[(.*?)\]', options_file_str)
@@ -642,72 +912,50 @@ def create_ground_atom_data_from_img_trajs(
                 for opt_arg in option_objs_strs_list
             ]
             option = option_name_to_option[option_name]
-            ground_option = option.ground(objects, np.array(option_params))
-            # NOTE: we assert the option was initiable in the env's initial
-            # state because during learning, we will assert that the option's
-            # initiable function was previously called.
+            if isinstance(option_params, float):
+                params_tuple = (option_params, )
+            else:
+                params_tuple = option_params
+            ground_option = option.ground(objects, np.array(params_tuple))
             assert ground_option.initiable(curr_train_task.init)
             ground_option_traj.append(ground_option)
         # Given ground options, we can finally make ImageOptionTrajectories.
         image_option_trajs.append(
-            ImageOptionTrajectory(list(curr_task_objs), state_traj,
-                                  ground_option_traj, True, train_task_idx))
+            ImageOptionTrajectory(list(curr_task_objs),
+                                  img_traj,
+                                  ground_option_traj,
+                                  state_traj,
+                                  _is_demo=True,
+                                  _train_task_idx=train_task_idx))
     # Given trajectories, we can now query the VLM to get proposals for ground
     # atoms that might be relevant to decision-making.
     if vlm is None:
-        vlm = GoogleGeminiVLM(CFG.vlm_model_name)  # pragma: no cover.
-
-    if not CFG.grammar_search_vlm_atom_proposal_use_debug:
-        logging.info("Querying VLM for candidate atom proposals...")
-        atom_strs_proposals_list = _sample_vlm_atom_proposals_from_trajectories(
-            image_option_trajs, vlm, 1)
-        logging.info("Done querying VLM for candidate atoms!")
-        # We now parse and sanitize this set of atoms.
-        atom_proposals_set = _parse_unique_atom_proposals_from_list(
-            atom_strs_proposals_list, all_task_objs)
-    else:  # pragma: no cover.
-        assert isinstance(env, VLMPredicateEnv)
-        atom_proposals_set = env.vlm_debug_atom_strs
-    assert len(atom_proposals_set) > 0, "Atom proposals set is empty!"
-    # Given this set of unique atom proposals, we now ask the VLM
-    # to label these in every scene from the demonstrations.
-    # NOTE: we convert to a sorted list here to get rid of randomness from set
-    # ordering.
-    unique_atoms_list = sorted(atom_proposals_set)
-    # Now, query the VLM!
-    logging.info("Querying VLM to label every scene...")
-    atom_labels = _label_trajectories_with_vlm_atom_values(
-        image_option_trajs, vlm, unique_atoms_list)
-    logging.info("Done querying VLM for scene labelling!")
-    # Save the output as a human-readable txt file.
-    save_labelled_trajs_as_txt(
-        env, atom_labels, [io_traj.actions for io_traj in image_option_trajs])
-    # Now, parse this information into a Dataset!
-    # Start by converting all the labelled atoms into a more structured
-    # dict. This requires each set of labelled atoms text to be enclosed
-    # by curly brackets.
-    structured_state_trajs = []
-    for atom_traj in atom_labels:
-        atoms_txt_strs = [
-            '{' + curr_ts_atoms_txt + '}' for curr_ts_atoms_txt in atom_traj
-        ]
-        full_traj_atoms_str = '\n\n'.join(atoms_txt_strs)
-        structured_state_trajs.append(
-            _parse_atoms_txt_into_structured_state(full_traj_atoms_str))
-    # Given this, we now convert each trajectory consisting of a series of
-    # structured states into a trajectory of GroundAtoms.
-    ground_atoms_trajs = _parse_structured_state_into_ground_atoms(
-        env, train_tasks, structured_state_trajs)
-    _debug_log_atoms_trajs(ground_atoms_trajs)
-    # Now, we just need to create a goal state for every train task where
-    # the dummy goal predicate holds. This is just bookkeeping necessary
-    # for NSRT learning and planning such that the goal doesn't hold
-    # in the initial state and holds in the final state of each demonstration
-    # trajectory.
-    goal_states_for_every_traj = _create_dummy_goal_state_for_each_task(
-        env, train_tasks)
-    # Finally, we need to construct actual LowLevelTrajectories.
-    low_level_trajs = _convert_ground_option_trajs_into_lowleveltrajs(
-        [traj.actions for traj in image_option_trajs],
-        goal_states_for_every_traj, train_tasks)
+        vlm = utils.create_vlm_by_name(CFG.vlm_model_name)  # pragma: no cover
+    ground_atoms_trajs = _query_vlm_to_generate_ground_atoms_trajs(
+        image_option_trajs, env, train_tasks, known_predicates, all_task_objs,
+        vlm)
+    # Finally, we just need to construct LowLevelTrajectories that we can
+    # output as part of our Dataset.
+    goal_states_for_every_traj = None
+    if "DummyGoal" in str(train_tasks[0].goal):
+        # First, we need to create a goal state for every train task
+        # where the dummy goal predicate holds. This is just bookkeeping
+        # necessary for NSRT learning and planning such that the goal
+        # doesn't hold in the initial state and holds in the final state of
+        # each demonstration trajectory.
+        goal_states_for_every_traj = _create_dummy_goal_state_for_each_task(
+            env, train_tasks)
+        # Finally, we need to construct actual LowLevelTrajectories.
+        low_level_trajs = _convert_ground_option_trajs_into_lowleveltrajs(
+            [traj.actions for traj in image_option_trajs],
+            goal_states_for_every_traj, train_tasks)
+    else:
+        low_level_trajs = []
+        for io_traj in image_option_trajs:
+            assert io_traj.states is not None
+            low_level_trajs.append(
+                LowLevelTrajectory(io_traj.states, [
+                    Action(np.zeros(env.action_space.shape, dtype=np.float32),
+                           act) for act in io_traj.actions
+                ], True, io_traj.train_task_idx))
     return Dataset(low_level_trajs, ground_atoms_trajs)
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt b/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt
index 45ecfebcd6..b205083259 100644
--- a/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt
+++ b/predicators/datasets/vlm_input_data_prompts/atom_labelling/per_scene_naive.txt
@@ -1,2 +1,2 @@
-You are a vision system for a robot. Your job is to output the values of the following predicates based on the provided visual scene. For each predicate, output True, False, or Unknown if the relevant objects are not in the scene or the value of the predicate simply cannot be determined. Output each predicate value as a bulleted list with each predicate and value on a different line. Use the format: <predicate>: <truth_value>. Ensure there is a period ('.') after every list item. Do not output any text except the names and truth values of predicates.
+You are a vision system for a robot. Your job is to output the values of the following predicates based on the provided visual scene. For each predicate, output True, False, or Unknown if the relevant objects are not in the scene or the value of the predicate simply cannot be determined. Output each predicate value as a bulleted list (use '*' for the bullets) with each predicate and value on a different line. Use the format: `<predicate>(<obj1>, <obj2>, ...): <truth_value>` given the provided predicates and arguments exactly. Ensure there is a period ('.') after every list item. Do not output any text except the names and truth values of predicates.
 Predicates:
\ No newline at end of file
diff --git a/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt b/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt
index 9532669d5f..0dddfe7001 100644
--- a/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt
+++ b/predicators/datasets/vlm_input_data_prompts/atom_proposal/options_labels_whole_traj.txt
@@ -1 +1 @@
-You are a robotic vision system whose job is to output a structured set of predicates useful for describing important concepts in the following demonstration of a task. You will be provided with a list of actions used during the task, as well as images of states before and after every action execution. Please provide predicates in terms of the following objects: {objs}. For each predicate, output it in the following format: predicate_name(obj1, obj2, obj3...). Start by generating predicates that change before and after each action. After this, generate any other predicates that perhaps do not change but are still important to describing the demonstration shown.
\ No newline at end of file
+You are a robotic vision system whose job is to output a structured set of predicates useful for describing important concepts in the following demonstration of a task. You will be provided with a list of actions used during the task, as well as images of states before and after every action execution. Please provide predicates in terms of the following objects: {objs}. For each predicate, output it in the following format: predicate_name(obj1, obj2, obj3...). Start by generating predicates that change before and after each action. After this, generate any other predicates that perhaps do not change but are still important to describing the demonstration shown. For each predicate you generate, also generate predicates that are synonyms and antonyms so that any predicate that is even tangentially relevant to the demonstrations is generated.
\ No newline at end of file
diff --git a/predicators/envs/assets/imgs/bottom_bun.png b/predicators/envs/assets/imgs/bottom_bun.png
new file mode 100644
index 0000000000..e9c30b7683
Binary files /dev/null and b/predicators/envs/assets/imgs/bottom_bun.png differ
diff --git a/predicators/envs/assets/imgs/cheese.png b/predicators/envs/assets/imgs/cheese.png
new file mode 100644
index 0000000000..b0c2be9b25
Binary files /dev/null and b/predicators/envs/assets/imgs/cheese.png differ
diff --git a/predicators/envs/assets/imgs/cooked_patty.png b/predicators/envs/assets/imgs/cooked_patty.png
new file mode 100644
index 0000000000..f94525408e
Binary files /dev/null and b/predicators/envs/assets/imgs/cooked_patty.png differ
diff --git a/predicators/envs/assets/imgs/cutting_board.png b/predicators/envs/assets/imgs/cutting_board.png
new file mode 100644
index 0000000000..6f3043cebe
Binary files /dev/null and b/predicators/envs/assets/imgs/cutting_board.png differ
diff --git a/predicators/envs/assets/imgs/floorwood.png b/predicators/envs/assets/imgs/floorwood.png
new file mode 100644
index 0000000000..48c5e4698d
Binary files /dev/null and b/predicators/envs/assets/imgs/floorwood.png differ
diff --git a/predicators/envs/assets/imgs/grill.png b/predicators/envs/assets/imgs/grill.png
new file mode 100644
index 0000000000..46d23d0cdb
Binary files /dev/null and b/predicators/envs/assets/imgs/grill.png differ
diff --git a/predicators/envs/assets/imgs/raw_patty.png b/predicators/envs/assets/imgs/raw_patty.png
new file mode 100644
index 0000000000..520d113d97
Binary files /dev/null and b/predicators/envs/assets/imgs/raw_patty.png differ
diff --git a/predicators/envs/assets/imgs/robot.png b/predicators/envs/assets/imgs/robot.png
new file mode 100644
index 0000000000..964af462d4
Binary files /dev/null and b/predicators/envs/assets/imgs/robot.png differ
diff --git a/predicators/envs/assets/imgs/robot_down.png b/predicators/envs/assets/imgs/robot_down.png
new file mode 100644
index 0000000000..20f3857eef
Binary files /dev/null and b/predicators/envs/assets/imgs/robot_down.png differ
diff --git a/predicators/envs/assets/imgs/robot_left.png b/predicators/envs/assets/imgs/robot_left.png
new file mode 100644
index 0000000000..bea06db276
Binary files /dev/null and b/predicators/envs/assets/imgs/robot_left.png differ
diff --git a/predicators/envs/assets/imgs/robot_right.png b/predicators/envs/assets/imgs/robot_right.png
new file mode 100644
index 0000000000..57e6794c94
Binary files /dev/null and b/predicators/envs/assets/imgs/robot_right.png differ
diff --git a/predicators/envs/assets/imgs/robot_up.png b/predicators/envs/assets/imgs/robot_up.png
new file mode 100644
index 0000000000..b34f1a2d65
Binary files /dev/null and b/predicators/envs/assets/imgs/robot_up.png differ
diff --git a/predicators/envs/assets/imgs/sliced_tomato.png b/predicators/envs/assets/imgs/sliced_tomato.png
new file mode 100644
index 0000000000..7e01e22191
Binary files /dev/null and b/predicators/envs/assets/imgs/sliced_tomato.png differ
diff --git a/predicators/envs/assets/imgs/top_bun.png b/predicators/envs/assets/imgs/top_bun.png
new file mode 100644
index 0000000000..d684f23a8e
Binary files /dev/null and b/predicators/envs/assets/imgs/top_bun.png differ
diff --git a/predicators/envs/assets/imgs/whole_tomato.png b/predicators/envs/assets/imgs/whole_tomato.png
new file mode 100644
index 0000000000..5b967b40b8
Binary files /dev/null and b/predicators/envs/assets/imgs/whole_tomato.png differ
diff --git a/predicators/envs/base_env.py b/predicators/envs/base_env.py
index 931c718245..950b1cb714 100644
--- a/predicators/envs/base_env.py
+++ b/predicators/envs/base_env.py
@@ -14,8 +14,8 @@
 from predicators.pretrained_model_interface import OpenAILLM
 from predicators.settings import CFG
 from predicators.structs import Action, DefaultEnvironmentTask, \
-    EnvironmentTask, GroundAtom, Object, Observation, Predicate, State, Type, \
-    Video
+    EnvironmentTask, GroundAtom, Object, Observation, Predicate, State, Task, \
+    Type, Video
 
 
 class BaseEnv(abc.ABC):
@@ -426,3 +426,25 @@ def get_observation(self) -> Observation:
         """Get the current observation of this environment."""
         assert isinstance(self._current_observation, State)
         return self._current_observation.copy()
+
+    def get_vlm_debug_atom_strs(self, train_tasks: List[Task]) -> Set[str]:
+        """A 'debug grammar' set of predicates that should be sufficient for
+        completing the task; useful for comparing different methods of VLM
+        truth-value labelling given the same set of atom proposals to label.
+
+        For the BaseEnv, this method simply takes the names of all
+        excluded predicates and uses these (i.e., forcing the VLM to
+        learn a classifier for these predicates). Subclasses can
+        override to handle more specific use cases.
+        """
+        _, excluded_preds = utils.parse_config_excluded_predicates(self)
+        all_ground_atoms_set: Set[GroundAtom] = set()
+        for tt in train_tasks:
+            all_ground_atoms_set |= set(
+                utils.all_possible_ground_atoms(tt.init, excluded_preds))
+        # The result is an unordered set of strings, so the ground atoms do
+        # not need to be sorted before being rendered as "Pred(obj1, obj2)".
+        return {
+            f"{atom.predicate.name}({', '.join(o.name for o in atom.objects)})"
+            for atom in all_ground_atoms_set
+        }
diff --git a/predicators/envs/burger.py b/predicators/envs/burger.py
new file mode 100644
index 0000000000..fb64f707e6
--- /dev/null
+++ b/predicators/envs/burger.py
@@ -0,0 +1,638 @@
+"""A simple gridworld environment where a robot prepares a burger, inspired by
+https://github.com/portal-cornell/robotouille.
+
+This environment uses assets from robotouille that were designed by
+Nicole Thean (https://github.com/nicolethean).
+"""
+
+import logging
+from typing import Callable, List, Optional, Sequence, Set, Tuple
+
+import matplotlib
+import matplotlib.image as mpimg
+import matplotlib.pyplot as plt
+# import pygame
+import numpy as np
+from gym.spaces import Box
+
+from predicators import utils
+from predicators.envs import BaseEnv
+from predicators.settings import CFG
+from predicators.structs import Action, EnvironmentTask, GroundAtom, Object, \
+    Predicate, State, Type
+
+
+class BurgerEnv(BaseEnv):
+    """A simple gridworld environment where a robot prepares a burger, inspired
+    by https://github.com/portal-cornell/robotouille.
+
+    This environment is designed to showcase a predicate invention approach that
+    learns geometric predicates that operate on the object-centric state and
+    vision-language model predicates that operate on the visual rendering of the
+    state.
+
+    One quirk of this environment is that we want certain parts of the state to
+    only be accessible by the oracle approach. This is because we want to invent
+    predicates like IsCooked as a VLM predicate, but not a geometric predicate,
+    so no information about how cooked an object is should be in the state. The
+    solution to this is to hide certain state inside State.simulator_state.
+    After the demonstrations are created by the oracle approach, we can erase
+    this simulator state before we pass the demonstrations to the predicate
+    invention approach.
+
+    Example command to see demos created:
+    python predicators/main.py --env burger
+    --approach grammar_search_invention --seed 0 --num_train_tasks 10
+    --option_model_terminate_on_repeat False
+    --sesame_max_skeletons_optimized 1000 --timeout 80
+    --sesame_max_samples_per_step 1 --make_demo_videos
+    --sesame_task_planner fdopt
+
+    Note that the default task planner is too slow -- fast downward is required.
+    """
+
+    # Types
+    _object_type = Type("object", [])
+    _item_type = Type("item", [], _object_type)
+    _station_type = Type("station", [], _object_type)
+
+    _robot_type = Type("robot", ["row", "col", "fingers", "dir"])
+
+    _patty_type = Type("patty", ["row", "col", "z"], _item_type)
+    _tomato_type = Type("tomato", ["row", "col", "z"], _item_type)
+    _cheese_type = Type("cheese", ["row", "col", "z"], _item_type)
+    _bottom_bun_type = Type("bottom_bun", ["row", "col", "z"], _item_type)
+    _top_bun_type = Type("top_bun", ["row", "col", "z"], _item_type)
+
+    _grill_type = Type("grill", ["row", "col", "z"], _station_type)
+    _cutting_board_type = Type("cutting_board", ["row", "col", "z"],
+                               _station_type)
+
+    dir_to_enum = {"up": 0, "left": 1, "down": 2, "right": 3}
+    enum_to_dir = {0: "up", 1: "left", 2: "down", 3: "right"}
+
+    num_rows = CFG.gridworld_num_rows
+    num_cols = CFG.gridworld_num_cols
+
+    def __init__(self, use_gui: bool = True) -> None:
+        super().__init__(use_gui)
+
+        # Predicates
+        self._Adjacent = Predicate("Adjacent",
+                                   [self._robot_type, self._object_type],
+                                   self.Adjacent_holds)
+        self._AdjacentToNothing = Predicate("AdjacentToNothing",
+                                            [self._robot_type],
+                                            self._AdjacentToNothing_holds)
+        self._Facing = Predicate("Facing",
+                                 [self._robot_type, self._object_type],
+                                 self.Facing_holds)
+        self._AdjacentNotFacing = Predicate(
+            "AdjacentNotFacing", [self._robot_type, self._object_type],
+            self._AdjacentNotFacing_holds)
+        self._IsCooked = Predicate("IsCooked", [self._patty_type],
+                                   self._IsCooked_holds)
+        self._IsSliced = Predicate("IsSliced", [self._tomato_type],
+                                   self._IsSliced_holds)
+        self._HandEmpty = Predicate("HandEmpty", [self._robot_type],
+                                    self._HandEmpty_holds)
+        self._Holding = Predicate("Holding",
+                                  [self._robot_type, self._item_type],
+                                  self._Holding_holds)
+        self._On = Predicate("On", [self._item_type, self._object_type],
+                             self._On_holds)
+        self._OnNothing = Predicate("OnNothing", [self._object_type],
+                                    self._OnNothing_holds)
+        self._Clear = Predicate("Clear", [self._object_type],
+                                self._Clear_holds)
+        self._GoalHack = Predicate("GoalHack", [
+            self._bottom_bun_type, self._patty_type, self._cheese_type,
+            self._tomato_type, self._top_bun_type
+        ], self._GoalHack_holds)
+
+        # Static objects (exist no matter the settings)
+        self._robot = Object("robby", self._robot_type)
+        self._grill = Object("grill", self._grill_type)
+        self._cutting_board = Object("cutting_board", self._cutting_board_type)
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "burger"
+
+    @property
+    def types(self) -> Set[Type]:
+        return {
+            self._object_type, self._item_type, self._station_type,
+            self._robot_type, self._patty_type, self._tomato_type,
+            self._cheese_type, self._bottom_bun_type, self._top_bun_type,
+            self._grill_type, self._cutting_board_type
+        }
+
+    def _get_tasks(self, num: int,
+                   rng: np.random.Generator) -> List[EnvironmentTask]:
+        del rng  # unused
+        tasks = []
+
+        # Add robot, grill, and cutting board
+        state_dict = {}
+        hidden_state = {}
+        state_dict[self._robot] = {
+            "row": 2,
+            "col": 2,
+            "fingers": 0.0,
+            "dir": 3
+        }
+        state_dict[self._grill] = {"row": 2, "col": 3, "z": 0}
+        state_dict[self._cutting_board] = {"row": 1, "col": 3, "z": 0}
+
+        # Add patty
+        patty = Object("patty", self._patty_type)
+        state_dict[patty] = {"row": 0, "col": 0, "z": 0}
+        hidden_state[patty] = {"is_cooked": 0.0, "is_held": 0.0}
+
+        # Add tomato
+        tomato = Object("tomato", self._tomato_type)
+        state_dict[tomato] = {"row": 0, "col": 1, "z": 0}
+        hidden_state[tomato] = {"is_sliced": 0.0, "is_held": 0.0}
+
+        # Add cheese
+        cheese = Object("cheese", self._cheese_type)
+        state_dict[cheese] = {"row": 3, "col": 0, "z": 0}
+        hidden_state[cheese] = {"is_held": 0.0}
+
+        # Add top bun
+        top_bun = Object("top_bun", self._top_bun_type)
+        state_dict[top_bun] = {"row": 3, "col": 1, "z": 0}
+        hidden_state[top_bun] = {"is_held": 0.0}
+
+        # Add bottom bun
+        bottom_bun = Object("bottom_bun", self._bottom_bun_type)
+        state_dict[bottom_bun] = {"row": 0, "col": 2, "z": 0}
+        hidden_state[bottom_bun] = {"is_held": 0.0}
+
+        goal = {
+            GroundAtom(self._On, [patty, bottom_bun]),
+            GroundAtom(self._On, [cheese, patty]),
+            GroundAtom(self._On, [tomato, cheese]),
+            GroundAtom(self._On, [top_bun, tomato]),
+            GroundAtom(self._IsCooked, [patty]),
+            GroundAtom(self._IsSliced, [tomato]),
+        }
+
+        for _ in range(num):
+            state = utils.create_state_from_dict(state_dict)
+            state.simulator_state = hidden_state
+            # Note: this takes in Observation, GoalDescription, whose types are
+            # Any
+            tasks.append(EnvironmentTask(state, goal))
+
+        return tasks
+
+    def _generate_train_tasks(self) -> List[EnvironmentTask]:
+        return self._get_tasks(num=CFG.num_train_tasks, rng=self._train_rng)
+
+    def _generate_test_tasks(self) -> List[EnvironmentTask]:
+        return self._get_tasks(num=CFG.num_test_tasks, rng=self._test_rng)
+
+    @classmethod
+    def Adjacent_holds(cls, state: State, objects: Sequence[Object]) -> bool:
+        """Public for use by oracle options."""
+        robot, obj = objects
+        rx, ry = cls.get_position(robot, state)
+        ox, oy = cls.get_position(obj, state)
+        return cls.is_adjacent(rx, ry, ox, oy)
+
+    def _AdjacentToNothing_holds(self, state: State,
+                                 objects: Sequence[Object]) -> bool:
+        robot, = objects
+        for obj in state:
+            if obj.is_instance(self._item_type) or \
+                obj.is_instance(self._station_type):
+                if self.Adjacent_holds(state, [robot, obj]):
+                    return False
+        return True
+
+    @classmethod
+    def Facing_holds(cls, state: State, objects: Sequence[Object]) -> bool:
+        """Public for use by oracle options."""
+        robot, obj = objects
+        rx, ry = cls.get_position(robot, state)
+        rdir = state.get(robot, "dir")
+        ox, oy = cls.get_position(obj, state)
+        facing_left = ry == oy and rx - ox == 1 and cls.enum_to_dir[
+            rdir] == "left"
+        facing_right = ry == oy and rx - ox == -1 and cls.enum_to_dir[
+            rdir] == "right"
+        facing_down = ry - oy == 1 and rx == ox and cls.enum_to_dir[
+            rdir] == "down"
+        facing_up = ry - oy == -1 and rx == ox and cls.enum_to_dir[rdir] == "up"
+        return facing_left or facing_right or facing_down or facing_up
+
+    def _AdjacentNotFacing_holds(self, state: State,
+                                 objects: Sequence[Object]) -> bool:
+        return self.Adjacent_holds(
+            state, objects) and not self.Facing_holds(state, objects)
+
+    def _IsCooked_holds(self, state: State, objects: Sequence[Object]) -> bool:
+        patty, = objects
+        return state.simulator_state[patty][  # type: ignore[index]
+            "is_cooked"] > 0.5
+
+    def _IsSliced_holds(self, state: State, objects: Sequence[Object]) -> bool:
+        tomato, = objects
+        return state.simulator_state[tomato][  # type: ignore[index]
+            "is_sliced"] > 0.5
+
+    def _HandEmpty_holds(self, state: State,
+                         objects: Sequence[Object]) -> bool:
+        robot, = objects
+        return state.get(robot, "fingers") < 0.5
+
+    def _Holding_holds(self, state: State, objects: Sequence[Object]) -> bool:
+        robot, item = objects
+        return not self._HandEmpty_holds(
+            state,
+            [robot]) and state.simulator_state[item][  # type: ignore[index]
+                "is_held"] > 0.5
+
+    def _On_holds(self, state: State, objects: Sequence[Object]) -> bool:
+        a, b = objects
+        ax, ay = self.get_position(a, state)
+        bx, by = self.get_position(b, state)
+        az = state.get(a, "z")
+        bz = state.get(b, "z")
+        return ax == bx and ay == by and az - 1 == bz
+
+    def _OnNothing_holds(self, state: State,
+                         objects: Sequence[Object]) -> bool:
+        obj, = objects
+        for other_obj in state:
+            if other_obj.is_instance(self._item_type) or other_obj.is_instance(
+                    self._station_type):
+                if self._On_holds(state, [obj, other_obj]):
+                    return False
+        return True
+
+    def _Clear_holds(self, state: State, objects: Sequence[Object]) -> bool:
+        obj, = objects
+        for other_obj in state:
+            if other_obj.is_instance(self._item_type) or other_obj.is_instance(
+                    self._station_type):
+                if self._On_holds(state, [other_obj, obj]):
+                    return False
+        return True
+
+    def _GoalHack_holds(self, state: State, objects: Sequence[Object]) -> bool:
+        bottom, patty, cheese, tomato, top = objects
+        atoms = [
+            self._On_holds(state, [patty, bottom]),
+            self._On_holds(state, [cheese, patty]),
+            self._On_holds(state, [tomato, cheese]),
+            self._On_holds(state, [top, tomato]),
+            self._IsCooked_holds(state, [patty]),
+            self._IsSliced_holds(state, [tomato])
+        ]
+        return all(atoms)
+
+    @classmethod
+    def get_position(cls, obj: Object, state: State) -> Tuple[int, int]:
+        """Public for use by oracle options."""
+        col = state.get(obj, "col")
+        row = state.get(obj, "row")
+        return col, row
+
+    @classmethod
+    def is_adjacent(cls, col_1: int, row_1: int, col_2: int,
+                    row_2: int) -> bool:
+        """Public for use by oracle options."""
+        adjacent_vertical = col_1 == col_2 and abs(row_1 - row_2) == 1
+        adjacent_horizontal = row_1 == row_2 and abs(col_1 - col_2) == 1
+        return adjacent_vertical or adjacent_horizontal
+
+    @property
+    def predicates(self) -> Set[Predicate]:
+        return {
+            self._Adjacent, self._AdjacentToNothing, self._AdjacentNotFacing,
+            self._Facing, self._IsCooked, self._IsSliced, self._HandEmpty,
+            self._Holding, self._On, self._OnNothing, self._Clear,
+            self._GoalHack
+        }
+
+    @property
+    def goal_predicates(self) -> Set[Predicate]:
+        return {self._On, self._IsCooked, self._IsSliced}
+
+    @property
+    def action_space(self) -> Box:
+        # dx (column), dy (row), direction, cut/cook, pick/place
+        # We expect dx and dy to be one of -1, 0, or 1.
+        # We expect direction to be one of -1, 0, 1, 2, or 3. -1 signifies
+        # "no change in direction", and 0, 1, 2, and 3 signify a direction.
+        # We expect cut/cook and pick/place to be 0 or 1.
+        return Box(low=np.array([-1.0, -1.0, -1.0, 0.0, 0.0]),
+                   high=np.array([1.0, 1.0, 3.0, 1.0, 1.0]),
+                   dtype=np.float32)
+
+    @staticmethod
+    def _get_robot_direction(dx: float, dy: float) -> str:
+        if dx < 0:
+            return "left"
+        if dx > 0:
+            return "right"
+        if dy < 0:
+            return "down"
+        if dy > 0:
+            return "up"
+        return "no_change"
+
+    @staticmethod
+    def get_cell_in_direction(x: int, y: int,
+                              direction: str) -> Tuple[int, int]:
+        """Public for use by tests."""
+        if direction == "left":
+            return (x - 1, y)
+        if direction == "right":
+            return (x + 1, y)
+        if direction == "up":
+            return (x, y + 1)
+        if direction == "down":
+            return (x, y - 1)
+        return (x, y)
+
+    @classmethod
+    def get_empty_cells(cls, state: State) -> Set[Tuple[int, int]]:
+        """Public for use by oracle options.
+
+        Returns the (col, row) grid cells whose coordinates do not match
+        the position of any object in the state.
+        """
+        all_cells = {(x, y)
+                     for y in range(cls.num_rows)
+                     for x in range(cls.num_cols)}
+        # Positions are compared to cells by exact (col, row) equality.
+        occupied = {cls.get_position(obj, state) for obj in state}
+
+        return all_cells - occupied
+
+    def simulate(self, state: State, action: Action) -> State:
+        """Apply one action to a copy of `state` and return that copy.
+        We assume only one of <dcol, drow>, <direction>, <interact>,
+        <pickplace> is not "null" per action (all null: <0, 0, -1, 0, 0>)."""
+        assert self.action_space.contains(action.arr)
+        next_state = state.copy()
+        dcol, drow, dir_from_turning, interact, pickplace = action.arr
+
+        rx, ry = self.get_position(self._robot, state)
+        new_rx = np.clip(rx + dcol, 0, self.num_cols - 1)
+        new_ry = np.clip(ry + drow, 0, self.num_rows - 1)
+
+        # Compute the robot's direction.
+        dir_from_movement = self._get_robot_direction(dcol, drow)
+        if dir_from_movement != "no_change":
+            next_state.set(self._robot, "dir",
+                           self.dir_to_enum[dir_from_movement])
+        elif dir_from_turning in [0, 1, 2, 3]:
+            next_state.set(self._robot, "dir", dir_from_turning)
+
+        # Get the objects we can interact with.
+        items = [obj for obj in state if obj.is_instance(self._item_type)]
+
+        # Check for collision.
+        other_objects = [
+            obj for obj in state if not obj.is_instance(self._robot_type)
+        ]
+        for obj in other_objects:
+            if obj in items:
+                if state.simulator_state[obj][  # type: ignore[index]
+                        "is_held"] > 0.5:
+                    continue
+            ox, oy = self.get_position(obj, state)
+            if abs(new_rx - ox) < 1e-3 and abs(new_ry - oy) < 1e-3:
+                return next_state
+
+        # No collision detected, so we can move the robot.
+        next_state.set(self._robot, "col", new_rx)
+        next_state.set(self._robot, "row", new_ry)
+
+        # If an object was held, move it with the robot.
+        for item in items:
+            if state.simulator_state[item][  # type: ignore[index]
+                    "is_held"] > 0.5:
+                next_state.set(item, "col", new_rx)
+                next_state.set(item, "row", new_ry)
+
+        # Handle interaction (cutting or cooking).
+        for item in items:
+            if self.Facing_holds(state,
+                                 [self._robot, item]) and interact > 0.5:
+                if item.is_instance(self._patty_type) and self._On_holds(
+                        state, [item, self._grill]):
+                    next_state.simulator_state[item][  # type: ignore[index]
+                        "is_cooked"] = 1.0
+                elif item.is_instance(self._tomato_type) and self._On_holds(
+                        state, [item, self._cutting_board]):
+                    next_state.simulator_state[item][  # type: ignore[index]
+                        "is_sliced"] = 1.0
+
+        # Handle picking.
+        if pickplace > 0.5 and self._HandEmpty_holds(state, [self._robot]):
+            facing_items = []
+            for item in items:
+                if self.Facing_holds(state, [self._robot, item]):
+                    facing_items.append((item, state.get(item, "z")))
+            if len(facing_items) > 0:
+                # We'll pick up the item that is "on top".
+                on_top = max(facing_items, key=lambda x: x[1])[0]
+                next_state.simulator_state[on_top][  # type: ignore[index]
+                    "is_held"] = 1.0
+                next_state.set(on_top, "col", rx)
+                next_state.set(on_top, "row", ry)
+                next_state.set(on_top, "z", 0)
+                next_state.set(self._robot, "fingers", 1.0)
+
+        # Handle placing.
+        if pickplace > 0.5 and not self._HandEmpty_holds(state, [self._robot]):
+            held_item = [
+                item for item in items
+                if state.simulator_state[item]  # type: ignore[index]
+                ["is_held"] > 0.5
+            ][0]
+            px, py = self.get_cell_in_direction(
+                rx, ry, self.enum_to_dir[state.get(self._robot, "dir")])
+            # Valid cells are 0-indexed, so the upper bounds are exclusive.
+            if 0 <= py < self.num_rows and 0 <= px < self.num_cols:
+                next_state.set(self._robot, "fingers", 0.0)
+                next_state.simulator_state[held_item][  # type: ignore[index]
+                    "is_held"] = 0.0
+                next_state.set(held_item, "col", px)
+                next_state.set(held_item, "row", py)
+                # If any other objects are at this location, then this must go
+                # on top of them.
+                objects_at_loc = []
+                for obj in other_objects:
+                    ox, oy = self.get_position(obj, state)
+                    if ox == px and oy == py:
+                        objects_at_loc.append((obj, state.get(obj, "z")))
+                new_z = 0
+                if len(objects_at_loc) > 0:
+                    new_z = max(objects_at_loc, key=lambda x: x[1])[1] + 1
+                next_state.set(held_item, "z", new_z)
+
+        return next_state
+
+    def render_state_plt(
+            self,
+            state: State,
+            task: EnvironmentTask,
+            action: Optional[Action] = None,
+            caption: Optional[str] = None) -> matplotlib.figure.Figure:
+        """Render `state` as a matplotlib figure (task and action unused)."""
+        figsize = (self.num_cols * 2, self.num_rows * 2)
+        fig, ax = plt.subplots(1, 1, figsize=figsize)
+        plt.suptitle(caption, wrap=True)
+
+        # Plot vertical lines
+        for i in range(self.num_cols + 1):
+            ax.axvline(x=i, color="k", linestyle="-")
+
+        # Plot horizontal lines
+        for i in range(self.num_rows + 1):
+            ax.axhline(y=i, color="k", linestyle="-")
+
+        # Draw robot
+        x, y = self.get_position(self._robot, state)
+        robot_direction = self.enum_to_dir[state.get(self._robot, "dir")]
+        robot_img = mpimg.imread(
+            utils.get_env_asset_path(f"imgs/robot_{robot_direction}.png"))
+        img_size = (0.8, 0.8)
+        ax.imshow(robot_img,
+                  extent=[
+                      x + (1 - img_size[0]) / 2, x + (1 + img_size[0]) / 2,
+                      y + (1 - img_size[1]) / 2, y + (1 + img_size[1]) / 2
+                  ])
+
+        # Draw grill
+        x, y = self.get_position(self._grill, state)
+        grill_img = mpimg.imread(utils.get_env_asset_path("imgs/grill.png"))
+        ax.imshow(grill_img, extent=[x, x + 1, y, y + 1])
+
+        # Draw cutting board
+        x, y = self.get_position(self._cutting_board, state)
+        cutting_board_img = mpimg.imread(
+            utils.get_env_asset_path("imgs/cutting_board.png"))
+        ax.imshow(cutting_board_img, extent=[x, x + 1, y, y + 1])
+
+        # Draw items
+        type_to_img = {
+            self._top_bun_type:
+            mpimg.imread(utils.get_env_asset_path("imgs/top_bun.png")),
+            self._bottom_bun_type:
+            mpimg.imread(utils.get_env_asset_path("imgs/bottom_bun.png")),
+            self._cheese_type:
+            mpimg.imread(utils.get_env_asset_path("imgs/cheese.png")),
+            self._tomato_type:
+            mpimg.imread(utils.get_env_asset_path("imgs/whole_tomato.png")),
+            self._patty_type:
+            mpimg.imread(utils.get_env_asset_path("imgs/raw_patty.png"))
+        }
+        held_img_size = (0.3, 0.3)
+        held_offset = held_img_size[1] * (1 / 3)
+        items = [obj for obj in state if obj.is_instance(self._item_type)]
+        for item in items:
+            img = type_to_img[item.type]
+            if "is_cooked" in state.simulator_state[  # type: ignore[index]
+                    item] and self._IsCooked_holds(state, [item]):
+                img = mpimg.imread(
+                    utils.get_env_asset_path("imgs/cooked_patty.png"))
+            elif "is_sliced" in state.simulator_state[  # type: ignore[index]
+                    item] and self._IsSliced_holds(state, [item]):
+                img = mpimg.imread(
+                    utils.get_env_asset_path("imgs/sliced_tomato.png"))
+            zorder = state.get(item, "z")
+            is_held = state.simulator_state[item][  # type: ignore[index]
+                "is_held"] > 0.5
+            x, y = self.get_position(item, state)
+            # If the item is held, make it smaller so that it does not
+            # obstruct the robot.
+            img_size = (0.7, 0.7)
+            if is_held:
+                extent = [
+                    x + (1 - held_img_size[0]) * (1 / 2),
+                    x + (1 + held_img_size[0]) * (1 / 2), y + held_offset,
+                    y + held_img_size[1] + held_offset
+                ]
+            # Stacked items are shifted up by their height. This offset is
+            # kept separate from held_offset, which must stay constant.
+            elif zorder > 0:
+                z_offset = 0.1 * zorder
+                extent = [
+                    x + (1 - img_size[0]) * (1 / 2),
+                    x + (1 + img_size[0]) * (1 / 2),
+                    y + (1 - img_size[1]) / 2 + z_offset,
+                    y + (1 + img_size[1]) / 2 + z_offset
+                ]
+            else:
+                extent = [
+                    x + (1 - img_size[0]) * (1 / 2),
+                    x + (1 + img_size[0]) * (1 / 2), y + (1 - img_size[1]) / 2,
+                    y + (1 + img_size[1]) / 2
+                ]
+            ax.imshow(img, extent=extent, zorder=zorder)
+
+        # Draw background
+        floor_img = mpimg.imread(
+            utils.get_env_asset_path("imgs/floorwood.png"))
+        for y in range(self.num_rows):
+            for x in range(self.num_cols):
+                ax.imshow(floor_img, extent=[x, x + 1, y, y + 1], zorder=-1)
+
+        ax.set_xlim(0, self.num_cols)
+        ax.set_ylim(0, self.num_rows)
+        ax.set_aspect("equal")
+        ax.axis("off")
+        plt.tight_layout()
+        return fig
+
+    def get_event_to_action_fn(
+            self) -> Callable[[State, matplotlib.backend_bases.Event], Action]:
+
+        def _event_to_action(state: State,
+                             event: matplotlib.backend_bases.Event) -> Action:
+            del state  # unused
+            logging.info(
+                "Controls: arrow keys to move, wasd to change direction, " \
+                "(e) to interact, (f) to pick/place, (q) to quit"
+            )
+            dcol, drow, turn, interact, pickplace = 0, 0, -1, 0, 0
+            if event.key == "q":
+                raise utils.HumanDemonstrationFailure("Human quit.")
+            if event.key == "w":
+                turn = 0
+            elif event.key == "a":
+                turn = 1
+            elif event.key == "s":
+                turn = 2
+            elif event.key == "d":
+                turn = 3
+            elif event.key == "left":
+                drow = 0
+                dcol = -1
+            elif event.key == "right":
+                drow = 0
+                dcol = 1
+            elif event.key == "down":
+                drow = -1
+                dcol = 0
+            elif event.key == "up":
+                drow = 1
+                dcol = 0
+            elif event.key == "e":
+                interact = 1
+            elif event.key == "f":
+                pickplace = 1
+            action = Action(
+                np.array([dcol, drow, turn, interact, pickplace],
+                         dtype=np.float32))
+            return action
+
+        return _event_to_action
diff --git a/predicators/envs/kitchen.py b/predicators/envs/kitchen.py
index 554a9b28f7..0329951442 100644
--- a/predicators/envs/kitchen.py
+++ b/predicators/envs/kitchen.py
@@ -43,8 +43,9 @@
 class KitchenEnv(BaseEnv):
     """Kitchen environment wrapping dm_control Kitchen."""
 
-    gripper_type = Type("gripper", ["x", "y", "z", "qw", "qx", "qy", "qz"])
     object_type = Type("object", ["x", "y", "z"])
+    gripper_type = Type("gripper", ["x", "y", "z", "qw", "qx", "qy", "qz"],
+                        parent=object_type)
     on_off_type = Type("on_off", ["x", "y", "z", "angle"], parent=object_type)
     hinge_door_type = Type("hinge_door", ["x", "y", "z", "angle"],
                            parent=on_off_type)
@@ -102,7 +103,9 @@ class KitchenEnv(BaseEnv):
     def __init__(self, use_gui: bool = True) -> None:
         super().__init__(use_gui)
         assert _MJKITCHEN_IMPORTED, "Failed to import kitchen gym env. \
-Install from https://github.com/SiddarGu/Gymnasium-Robotics.git"
+Install from https://github.com/SiddarGu/Gymnasium-Robotics.git. \
+BE SURE TO INSTALL FROM GITHUB SOURCE THOUGH; do not blindly install as the \
+README of that repo suggests!"
 
         if use_gui:
             assert not CFG.make_test_videos or CFG.make_failure_videos, \
@@ -112,7 +115,8 @@ def __init__(self, use_gui: bool = True) -> None:
 
         render_mode = "human" if self._using_gui else "rgb_array"
         self._gym_env = mujoco_kitchen_gym.make("FrankaKitchen-v1",
-                                                render_mode=render_mode)
+                                                render_mode=render_mode,
+                                                ik_controller=True)
 
     def _generate_train_tasks(self) -> List[EnvironmentTask]:
         return self._get_tasks(num=CFG.num_train_tasks, train_or_test="train")
@@ -188,11 +192,19 @@ def predicates(self) -> Set[Predicate]:
 
     @property
     def goal_predicates(self) -> Set[Predicate]:
-        return {
-            self._pred_name_to_pred["OnTop"],
-            self._pred_name_to_pred["TurnedOn"],
-            self._pred_name_to_pred["Open"]
-        }
+        OnTop = self._pred_name_to_pred["OnTop"]
+        TurnedOn = self._pred_name_to_pred["TurnedOn"]
+        KettleBoiling = self._pred_name_to_pred["KettleBoiling"]
+        goal_preds: Set[Predicate] = set()
+        if CFG.kitchen_goals in ["all", "kettle_only"]:
+            goal_preds.add(OnTop)
+        if CFG.kitchen_goals in ["all", "knob_only"]:
+            goal_preds.add(TurnedOn)
+        if CFG.kitchen_goals in ["all", "light_only"]:
+            goal_preds.add(TurnedOn)  # same predicate as knob_only
+        if CFG.kitchen_goals in ["all", "boil_kettle"]:
+            goal_preds.add(KettleBoiling)
+        return goal_preds
 
     @classmethod
     def create_predicates(cls) -> Dict[str, Predicate]:
@@ -214,7 +226,17 @@ def create_predicates(cls) -> Dict[str, Predicate]:
             Predicate("TurnedOff", [cls.on_off_type], cls.Off_holds),
             Predicate("Open", [cls.on_off_type], cls.Open_holds),
             Predicate("Closed", [cls.on_off_type], cls.Closed_holds),
+            Predicate("BurnerAhead", [cls.surface_type, cls.surface_type],
+                      cls._BurnerAhead_holds),
+            Predicate("BurnerBehind", [cls.surface_type, cls.surface_type],
+                      cls._BurnerBehind_holds),
+            Predicate("KettleBoiling",
+                      [cls.kettle_type, cls.surface_type, cls.knob_type],
+                      cls._KettleBoiling_holds),
+            Predicate("KnobAndBurnerLinked", [cls.knob_type, cls.surface_type],
+                      cls._KnobAndBurnerLinkedHolds),
         }
+
         return {p.name: p for p in preds}
 
     @property
@@ -244,15 +266,17 @@ def reset(self, train_or_test: str, task_idx: int) -> Observation:
         return self._copy_observation(self._current_observation)
 
     def simulate(self, state: State, action: Action) -> State:
-        raise NotImplementedError("Simulate not implemented for gym envs. " +
-                                  "Try using --bilevel_plan_without_sim True")
+        raise NotImplementedError(
+            "Simulate not implemented for kitchen env. " +
+            "Try using --bilevel_plan_without_sim True")
 
     def step(self, action: Action) -> Observation:
         self._gym_env.step(action.arr)
         if self._using_gui:
             self._gym_env.render()
         self._current_observation = {
-            "state_info": self.get_object_centric_state_info()
+            "state_info": self.get_object_centric_state_info(),
+            "obs_images": [self._gym_env.render()]
         }
         return self._copy_observation(self._current_observation)
 
@@ -303,6 +327,8 @@ def goal_reached(self) -> bool:
         kettle_on_burner = self._OnTop_holds(state, [kettle, burner4])
         knob4_turned_on = self.On_holds(state, [knob4])
         light_turned_on = self.On_holds(state, [light])
+        kettle_boiling = self._KettleBoiling_holds(state,
+                                                   [kettle, burner4, knob4])
         if goal_desc == ("Move the kettle to the back burner and turn it on; "
                          "also turn on the light"):
             return kettle_on_burner and knob4_turned_on and light_turned_on
@@ -312,6 +338,8 @@ def goal_reached(self) -> bool:
             return knob4_turned_on
         if goal_desc == "Turn on the light":
             return light_turned_on
+        if goal_desc == "Move the kettle to the back burner and turn it on":
+            return kettle_boiling
         raise NotImplementedError(f"Unrecognized goal: {goal_desc}")
 
     def _get_tasks(self, num: int,
@@ -319,7 +347,7 @@ def _get_tasks(self, num: int,
         tasks = []
 
         assert CFG.kitchen_goals in [
-            "all", "kettle_only", "knob_only", "light_only"
+            "all", "kettle_only", "knob_only", "light_only", "boil_kettle"
         ]
         goal_descriptions: List[str] = []
         if CFG.kitchen_goals in ["all", "kettle_only"]:
@@ -328,6 +356,9 @@ def _get_tasks(self, num: int,
             goal_descriptions.append("Turn on the back burner")
         if CFG.kitchen_goals in ["all", "light_only"]:
             goal_descriptions.append("Turn on the light")
+        if CFG.kitchen_goals in ["all", "boil_kettle"]:
+            goal_descriptions.append(
+                "Move the kettle to the back burner and turn it on")
         if CFG.kitchen_goals == "all":
             desc = ("Move the kettle to the back burner and turn it on; also "
                     "turn on the light")
@@ -344,7 +375,10 @@ def _get_tasks(self, num: int,
 
     def _reset_initial_state_from_seed(self, seed: int) -> Observation:
         self._gym_env.reset(seed=seed)
-        return {"state_info": self.get_object_centric_state_info()}
+        return {
+            "state_info": self.get_object_centric_state_info(),
+            "obs_images": [self._gym_env.render()]
+        }
 
     @classmethod
     def _AtPreTurn_holds(cls, state: State, objects: Sequence[Object],
@@ -443,7 +477,7 @@ def _NotOnTop_holds(cls, state: State, objects: Sequence[Object]) -> bool:
     def On_holds(cls,
                  state: State,
                  objects: Sequence[Object],
-                 thresh_pad: float = 0.0) -> bool:
+                 thresh_pad: float = -0.03) -> bool:
         """Made public for use in ground-truth options."""
         obj = objects[0]
         if obj.is_instance(cls.knob_type):
@@ -499,5 +533,44 @@ def Closed_holds(cls,
             return state.get(obj, "x") <= cls.cabinet_open_thresh - thresh_pad
         return False
 
+    @classmethod
+    def _BurnerAhead_holds(cls, state: State,
+                           objects: Sequence[Object]) -> bool:
+        """True iff burner1's "y" exceeds burner2's; static predicate used to
+        choose between pushing and pulling the kettle."""
+        burner1, burner2 = objects
+        if burner1 == burner2:
+            return False
+        return state.get(burner1, "y") > state.get(burner2, "y")
+
+    @classmethod
+    def _BurnerBehind_holds(cls, state: State,
+                            objects: Sequence[Object]) -> bool:
+        """True iff the burners differ and BurnerAhead does not hold; static
+        predicate used to choose between pushing and pulling the kettle."""
+        burner1, burner2 = objects
+        if burner1 == burner2:
+            return False
+        return not cls._BurnerAhead_holds(state, objects)
+
+    @classmethod
+    def _KettleBoiling_holds(cls, state: State,
+                             objects: Sequence[Object]) -> bool:
+        """Goal predicate: knob is On and kettle is OnTop of burner."""
+        kettle, burner, knob = objects
+        return cls.On_holds(state, [knob]) and cls._OnTop_holds(
+            state, [kettle, burner])
+
+    @classmethod
+    def _KnobAndBurnerLinkedHolds(cls, state: State,
+                                  objects: Sequence[Object]) -> bool:
+        """True iff knob and burner names end with the same character."""
+        del state  # unused
+        knob, burner = objects
+        # NOTE: we assume the knobs and burners are all named "knob1",
+        # "burner1", ..., and that "knob1" corresponds to "burner1". Only
+        # the last character is compared, so multi-digit indices can collide.
+        return knob.name[-1] == burner.name[-1]
+
     def _copy_observation(self, obs: Observation) -> Observation:
         return copy.deepcopy(obs)
diff --git a/predicators/envs/vlm_envs.py b/predicators/envs/vlm_envs.py
index 9cba7d610e..a065efa3f5 100644
--- a/predicators/envs/vlm_envs.py
+++ b/predicators/envs/vlm_envs.py
@@ -4,7 +4,6 @@
 future.
 """
 
-import abc
 from typing import List, Optional, Sequence, Set
 
 import matplotlib
@@ -14,7 +13,7 @@
 from predicators.envs import BaseEnv
 from predicators.settings import CFG
 from predicators.structs import Action, EnvironmentTask, GroundAtom, Object, \
-    Predicate, State, Type
+    Predicate, State, Task, Type
 
 DUMMY_GOAL_OBJ_NAME = "dummy_goal_obj"  # used in VLM parsing as well.
 
@@ -82,14 +81,6 @@ def _get_tasks(
         del num, rng
         raise NotImplementedError("Override!")
 
-    @property
-    @abc.abstractmethod
-    def vlm_debug_atom_strs(self) -> Set[str]:
-        """Return a set of atom strings that should be sufficient for a VLM to
-        label demonstrations consistently to learn good operators."""
-        raise NotImplementedError(
-            "VLM debug atom strings not implemented for this environment.")
-
 
 class IceTeaMakingEnv(VLMPredicateEnv):
     """A (simplified) version of a tea-making task that's closer to pick-and-
@@ -140,11 +131,8 @@ def _get_tasks(self, num: int,
             for _ in range(num)
         ]
 
-    @property
-    def vlm_debug_atom_strs(self) -> Set[str]:
-        """A 'debug grammar' set of predicates that should be sufficient for
-        completing the task; useful for comparing different methods of VLM
-        truth-value labelling given the same set of atom proposals to label."""
+    def get_vlm_debug_atom_strs(self, train_tasks: List[Task]) -> Set[str]:
+        del train_tasks
         return set([
             "hand_grasping_spoon(hand, spoon)",
             "hand_grasping_teabag(hand, teabag)", "spoon_in_cup(spoon, cup)",
diff --git a/predicators/execution_monitoring/expected_atoms_monitor.py b/predicators/execution_monitoring/expected_atoms_monitor.py
index 884483b15e..38cc32329c 100644
--- a/predicators/execution_monitoring/expected_atoms_monitor.py
+++ b/predicators/execution_monitoring/expected_atoms_monitor.py
@@ -3,10 +3,11 @@
 
 import logging
 
+from predicators import utils
 from predicators.execution_monitoring.base_execution_monitor import \
     BaseExecutionMonitor
 from predicators.settings import CFG
-from predicators.structs import State
+from predicators.structs import State, VLMPredicate
 
 
 class ExpectedAtomsExecutionMonitor(BaseExecutionMonitor):
@@ -30,7 +31,19 @@ def step(self, state: State) -> bool:
         self._curr_plan_timestep += 1
         # If the expected atoms are a subset of the current atoms, then
         # we don't have to replan.
-        unsat_atoms = {a for a in next_expected_atoms if not a.holds(state)}
+        next_expected_vlm_atoms = set(
+            atom for atom in next_expected_atoms
+            if isinstance(atom.predicate, VLMPredicate))
+        non_vlm_unsat_atoms = {
+            a
+            for a in (next_expected_atoms - next_expected_vlm_atoms)
+            if not a.holds(state)
+        }
+        vlm_unsat_atoms = set()
+        if len(next_expected_vlm_atoms) > 0:
+            vlm_unsat_atoms = utils.query_vlm_for_atom_vals(
+                next_expected_vlm_atoms, state)  # pragma: no cover
+        unsat_atoms = non_vlm_unsat_atoms | vlm_unsat_atoms
         if not unsat_atoms:
             return False
         logging.info(
diff --git a/predicators/ground_truth_models/burger/__init__.py b/predicators/ground_truth_models/burger/__init__.py
new file mode 100644
index 0000000000..8b9dd72f9e
--- /dev/null
+++ b/predicators/ground_truth_models/burger/__init__.py
@@ -0,0 +1,6 @@
+"""Ground truth models for burger environment."""
+
+from .nsrts import BurgerGroundTruthNSRTFactory
+from .options import BurgerGroundTruthOptionFactory
+
+__all__ = ["BurgerGroundTruthOptionFactory", "BurgerGroundTruthNSRTFactory"]
diff --git a/predicators/ground_truth_models/burger/nsrts.py b/predicators/ground_truth_models/burger/nsrts.py
new file mode 100644
index 0000000000..408eadb885
--- /dev/null
+++ b/predicators/ground_truth_models/burger/nsrts.py
@@ -0,0 +1,601 @@
+"""Ground-truth NSRTs for the burger environment."""
+
+from typing import Dict, Set
+
+from predicators.ground_truth_models import GroundTruthNSRTFactory
+from predicators.structs import NSRT, LiftedAtom, ParameterizedOption, \
+    Predicate, Type, Variable
+from predicators.utils import null_sampler
+
+
+class BurgerGroundTruthNSRTFactory(GroundTruthNSRTFactory):
+    """Ground-truth NSRTs for the Burger environment."""
+
+    @classmethod
+    def get_env_names(cls) -> Set[str]:
+        return {"burger"}
+
+    @staticmethod
+    def get_nsrts(env_name: str, types: Dict[str, Type],
+                  predicates: Dict[str, Predicate],
+                  options: Dict[str, ParameterizedOption]) -> Set[NSRT]:
+
+        # Types
+        tomato_type = types["tomato"]
+        patty_type = types["patty"]
+
+        grill_type = types["grill"]
+        cutting_board_type = types["cutting_board"]
+        robot_type = types["robot"]
+
+        item_type = types["item"]
+        object_type = types["object"]
+
+        # Variables
+        tomato = Variable("?tomato", tomato_type)
+        patty = Variable("?patty", patty_type)
+
+        grill = Variable("?grill", grill_type)
+        cutting_board = Variable("?cutting_board", cutting_board_type)
+        robot = Variable("?robot", robot_type)
+
+        item = Variable("?item", item_type)
+        obj = Variable("?object", object_type)
+
+        from_obj1 = Variable("?from_obj1", item_type)
+        from_obj2 = Variable("?from_obj2", item_type)
+        from_obj3 = Variable("?from_obj3", item_type)
+        from_obj4 = Variable("?from_obj4", object_type)
+
+        to_obj1 = Variable("?to_obj1", item_type)
+        to_obj2 = Variable("?to_obj2", item_type)
+        to_obj3 = Variable("?to_obj3", item_type)
+        to_obj4 = Variable("?to_obj4", object_type)
+
+        # Predicates
+        Adjacent = predicates["Adjacent"]
+        AdjacentToNothing = predicates["AdjacentToNothing"]
+        Facing = predicates["Facing"]
+        IsCooked = predicates["IsCooked"]
+        IsSliced = predicates["IsSliced"]
+        HandEmpty = predicates["HandEmpty"]
+        Holding = predicates["Holding"]
+        On = predicates["On"]
+        OnNothing = predicates["OnNothing"]
+        Clear = predicates["Clear"]
+
+        # Options
+        Move = options["Move"]
+        Pick = options["Pick"]
+        Place = options["Place"]
+        Cook = options["Cook"]
+        Slice = options["Slice"]
+
+        nsrts = set()
+
+        # Slice
+        parameters = [robot, tomato, cutting_board]
+        option_vars = [robot, tomato, cutting_board]
+        option = Slice
+        preconditions = {
+            LiftedAtom(Clear, [tomato]),
+            LiftedAtom(On, [tomato, cutting_board]),
+            LiftedAtom(Facing, [robot, tomato])
+        }
+        add_effects = {LiftedAtom(IsSliced, [tomato])}
+        delete_effects: Set[LiftedAtom] = set()
+        ignore_effects: Set[Predicate] = set()
+        slice_nsrt = NSRT("Slice", parameters, preconditions, add_effects,
+                          delete_effects, ignore_effects, option, option_vars,
+                          null_sampler)
+        nsrts.add(slice_nsrt)
+
+        # Cook
+        parameters = [robot, patty, grill]
+        option_vars = [robot, patty, grill]
+        option = Cook
+        preconditions = {
+            LiftedAtom(Clear, [patty]),
+            LiftedAtom(On, [patty, grill]),
+            LiftedAtom(Facing, [robot, patty])
+        }
+        add_effects = {LiftedAtom(IsCooked, [patty])}
+        delete_effects = set()
+        ignore_effects = set()
+        cook_nsrt = NSRT("Cook", parameters, preconditions, add_effects,
+                         delete_effects, ignore_effects, option, option_vars,
+                         null_sampler)
+        nsrts.add(cook_nsrt)
+
+        # NOTE: this nsrt will be relevant after the environment is updated to
+        # have more variation in the tasks' initial states.
+        # # MoveWhenAlreadyAdjacent
+        # parameters = [robot, to_obj, from_obj3]
+        # option_vars = [robot, to_obj]
+        # option = Move
+        # preconditions = {
+        #     LiftedAtom(Adjacent, [robot, from_obj3]),
+        #     LiftedAtom(Adjacent, [robot, to_obj]),
+        #     LiftedAtom(Facing, [robot, from_obj3])
+        # }
+        # add_effects = {
+        #     LiftedAtom(Facing, [robot, to_obj])
+        # }
+        # delete_effects = {
+        #     LiftedAtom(Facing, [robot, from_obj3])
+        # }
+        # ignore_effects = set()
+        # move_when_already_adjacent_nsrt = NSRT(
+        #     "MoveWhenAlreadyAdjacent",
+        #     parameters,
+        #     preconditions,
+        #     add_effects,
+        #     delete_effects,
+        #     ignore_effects,
+        #     option,
+        #     option_vars,
+        #     null_sampler
+        # )
+        # nsrts.add(move_when_already_adjacent_nsrt)
+
+        # MoveFromNothingToOneStack
+        parameters = [robot, to_obj4]
+        option_vars = [robot, to_obj4]
+        option = Move
+        preconditions = {
+            LiftedAtom(AdjacentToNothing, [robot]),
+            LiftedAtom(Clear, [to_obj4]),
+            LiftedAtom(OnNothing, [to_obj4])
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {LiftedAtom(AdjacentToNothing, [robot])}
+        ignore_effects = set()
+        move_from_nothing_to_one_stack_nsrt = NSRT("MoveFromNothingToOneStack",
+                                                   parameters, preconditions,
+                                                   add_effects, delete_effects,
+                                                   ignore_effects, option,
+                                                   option_vars, null_sampler)
+        nsrts.add(move_from_nothing_to_one_stack_nsrt)
+
+        # MoveFromNothingToTwoStack
+        parameters = [robot, to_obj1, to_obj4]
+        option_vars = [robot, to_obj1]
+        option = Move
+        preconditions = {
+            LiftedAtom(AdjacentToNothing, [robot]),
+            LiftedAtom(Clear, [to_obj1]),
+            LiftedAtom(On, [to_obj1, to_obj4]),
+            LiftedAtom(OnNothing, [to_obj4]),
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj1]),
+            LiftedAtom(Facing, [robot, to_obj1]),
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4]),
+        }
+        delete_effects = {LiftedAtom(AdjacentToNothing, [robot])}
+        ignore_effects = set()
+        move_from_nothing_to_two_stack_nsrt = NSRT("MoveFromNothingToTwoStack",
+                                                   parameters, preconditions,
+                                                   add_effects, delete_effects,
+                                                   ignore_effects, option,
+                                                   option_vars, null_sampler)
+        nsrts.add(move_from_nothing_to_two_stack_nsrt)
+
+        # MoveFromNothingToThreeStack
+        parameters = [robot, to_obj1, to_obj2, to_obj4]
+        option_vars = [robot, to_obj1]
+        option = Move
+        preconditions = {
+            LiftedAtom(AdjacentToNothing, [robot]),
+            LiftedAtom(Clear, [to_obj1]),
+            LiftedAtom(On, [to_obj1, to_obj2]),
+            LiftedAtom(On, [to_obj2, to_obj4]),
+            LiftedAtom(OnNothing, [to_obj4]),
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj1]),
+            LiftedAtom(Facing, [robot, to_obj1]),
+            LiftedAtom(Adjacent, [robot, to_obj2]),
+            LiftedAtom(Facing, [robot, to_obj2]),
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4]),
+        }
+        delete_effects = {LiftedAtom(AdjacentToNothing, [robot])}
+        ignore_effects = set()
+        move_from_nothing_to_three_stack_nsrt = NSRT(
+            "MoveFromNothingToThreeStack", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(move_from_nothing_to_three_stack_nsrt)
+
+        # MoveFromNothingToFourStack
+        parameters = [robot, to_obj1, to_obj2, to_obj3, to_obj4]
+        option_vars = [robot, to_obj1]
+        option = Move
+        preconditions = {
+            LiftedAtom(AdjacentToNothing, [robot]),
+            LiftedAtom(Clear, [to_obj1]),
+            LiftedAtom(On, [to_obj1, to_obj2]),
+            LiftedAtom(On, [to_obj2, to_obj3]),
+            LiftedAtom(On, [to_obj3, to_obj4]),
+            LiftedAtom(OnNothing, [to_obj4]),
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj1]),
+            LiftedAtom(Facing, [robot, to_obj1]),
+            LiftedAtom(Adjacent, [robot, to_obj2]),
+            LiftedAtom(Facing, [robot, to_obj2]),
+            LiftedAtom(Adjacent, [robot, to_obj3]),
+            LiftedAtom(Facing, [robot, to_obj3]),
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {LiftedAtom(AdjacentToNothing, [robot])}
+        ignore_effects = set()
+        move_from_nothing_to_four_stack_nsrt = NSRT(
+            "MoveFromNothingToFourStack", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(move_from_nothing_to_four_stack_nsrt)
+
+        # MoveWhenFacingOneStack
+        parameters = [robot, to_obj4, from_obj4]
+        option_vars = [robot, to_obj4]
+        option = Move
+        preconditions = {
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+            LiftedAtom(Clear, [from_obj4]),
+            LiftedAtom(OnNothing, [from_obj4])
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+        }
+        ignore_effects = set()
+        move_when_facing_one_stack_nsrt = NSRT("MoveWhenFacingOneStack",
+                                               parameters, preconditions,
+                                               add_effects, delete_effects,
+                                               ignore_effects, option,
+                                               option_vars, null_sampler)
+        nsrts.add(move_when_facing_one_stack_nsrt)
+
+        # MoveWhenFacingTwoStack
+        parameters = [robot, to_obj4, from_obj1, from_obj4]
+        option_vars = [robot, to_obj4]
+        option = Move
+        preconditions = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+            LiftedAtom(Clear, [from_obj1]),
+            LiftedAtom(On, [from_obj1, from_obj4]),
+            LiftedAtom(OnNothing, [from_obj4])
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4])
+        }
+        ignore_effects = set()
+        move_when_facing_two_stack_nsrt = NSRT("MoveWhenFacingTwoStack",
+                                               parameters, preconditions,
+                                               add_effects, delete_effects,
+                                               ignore_effects, option,
+                                               option_vars, null_sampler)
+        nsrts.add(move_when_facing_two_stack_nsrt)
+
+        # MoveWhenFacingThreeStack
+        parameters = [robot, to_obj4, from_obj1, from_obj2, from_obj4]
+        option_vars = [robot, to_obj4]
+        option = Move
+        preconditions = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj2]),
+            LiftedAtom(Facing, [robot, from_obj2]),
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+            LiftedAtom(Clear, [from_obj1]),
+            LiftedAtom(On, [from_obj1, from_obj2]),
+            LiftedAtom(On, [from_obj2, from_obj4]),
+            LiftedAtom(OnNothing, [from_obj4])
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj2]),
+            LiftedAtom(Facing, [robot, from_obj2]),
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4])
+        }
+        ignore_effects = set()
+        move_when_facing_three_stack_nsrt = NSRT("MoveWhenFacingThreeStack",
+                                                 parameters, preconditions,
+                                                 add_effects, delete_effects,
+                                                 ignore_effects, option,
+                                                 option_vars, null_sampler)
+        nsrts.add(move_when_facing_three_stack_nsrt)
+
+        # MoveWhenFacingFourStack
+        parameters = [
+            robot, to_obj4, from_obj1, from_obj2, from_obj3, from_obj4
+        ]
+        option_vars = [robot, to_obj4]
+        option = Move
+        preconditions = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj2]),
+            LiftedAtom(Facing, [robot, from_obj2]),
+            LiftedAtom(Adjacent, [robot, from_obj3]),
+            LiftedAtom(Facing, [robot, from_obj3]),
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+            LiftedAtom(Clear, [from_obj1]),
+            LiftedAtom(On, [from_obj1, from_obj2]),
+            LiftedAtom(On, [from_obj2, from_obj3]),
+            LiftedAtom(On, [from_obj3, from_obj4]),
+            LiftedAtom(OnNothing, [from_obj4])
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj2]),
+            LiftedAtom(Facing, [robot, from_obj2]),
+            LiftedAtom(Adjacent, [robot, from_obj3]),
+            LiftedAtom(Facing, [robot, from_obj3]),
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4])
+        }
+        ignore_effects = set()
+        move_when_facing_four_stack_nsrt = NSRT("MoveWhenFacingFourStack",
+                                                parameters, preconditions,
+                                                add_effects, delete_effects,
+                                                ignore_effects, option,
+                                                option_vars, null_sampler)
+        nsrts.add(move_when_facing_four_stack_nsrt)
+
+        # MoveWhenFacingThreeStack (variant whose bottom object is an item)
+        parameters = [robot, to_obj4, from_obj1, from_obj2, from_obj3]
+        option_vars = [robot, to_obj4]
+        option = Move
+        preconditions = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj2]),
+            LiftedAtom(Facing, [robot, from_obj2]),
+            LiftedAtom(Adjacent, [robot, from_obj3]),
+            LiftedAtom(Facing, [robot, from_obj3]),
+            LiftedAtom(Clear, [from_obj1]),
+            LiftedAtom(On, [from_obj1, from_obj2]),
+            LiftedAtom(On, [from_obj2, from_obj3]),
+            LiftedAtom(OnNothing, [from_obj3])
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4])
+        }
+        delete_effects = {
+            LiftedAtom(Adjacent, [robot, from_obj1]),
+            LiftedAtom(Facing, [robot, from_obj1]),
+            LiftedAtom(Adjacent, [robot, from_obj2]),
+            LiftedAtom(Facing, [robot, from_obj2]),
+            LiftedAtom(Adjacent, [robot, from_obj3]),
+            LiftedAtom(Facing, [robot, from_obj3])
+        }
+        ignore_effects = set()
+        move_when_facing_three_stack_nsrt = NSRT("MoveWhenFacingThreeStack",
+                                                 parameters, preconditions,
+                                                 add_effects, delete_effects,
+                                                 ignore_effects, option,
+                                                 option_vars, null_sampler)
+        nsrts.add(move_when_facing_three_stack_nsrt)
+
+        # MoveFromOneStackToThreeStack
+        parameters = [robot, to_obj1, to_obj2, to_obj4, from_obj4]
+        option_vars = [robot, to_obj1]
+        option = Move
+        preconditions = {
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+            LiftedAtom(Clear, [from_obj4]),
+            LiftedAtom(OnNothing, [from_obj4]),
+            LiftedAtom(Clear, [to_obj1]),
+            LiftedAtom(On, [to_obj1, to_obj2]),
+            LiftedAtom(On, [to_obj2, to_obj4]),
+            LiftedAtom(OnNothing, [to_obj4]),
+        }
+        add_effects = {
+            LiftedAtom(Adjacent, [robot, to_obj1]),
+            LiftedAtom(Facing, [robot, to_obj1]),
+            LiftedAtom(Adjacent, [robot, to_obj2]),
+            LiftedAtom(Facing, [robot, to_obj2]),
+            LiftedAtom(Adjacent, [robot, to_obj4]),
+            LiftedAtom(Facing, [robot, to_obj4]),
+        }
+        delete_effects = {
+            LiftedAtom(Adjacent, [robot, from_obj4]),
+            LiftedAtom(Facing, [robot, from_obj4]),
+        }
+        ignore_effects = set()
+        move_from_one_stack_to_three_stack_nsrt = NSRT(
+            "MoveFromOneStackToThreeStack", parameters, preconditions,
+            add_effects, delete_effects, ignore_effects, option, option_vars,
+            null_sampler)
+        nsrts.add(move_from_one_stack_to_three_stack_nsrt)
+
+        # NOTE: this nsrt will be relevant after the environment is updated to
+        # have more variation in the tasks' initial states.
+        # # MoveWhenNotFacingStart
+        # parameters = [robot, to_obj, from_obj3]
+        # option_vars = [robot, to_obj]
+        # option = Move
+        # preconditions = {
+        #     LiftedAtom(Adjacent, [robot, from_obj3]),
+        #     LiftedAtom(AdjacentNotFacing, [robot, from_obj3])
+        # }
+        # add_effects = {
+        #     LiftedAtom(Adjacent, [robot, to_obj]),
+        #     LiftedAtom(Facing, [robot, to_obj])
+        # }
+        # delete_effects = {
+        #     LiftedAtom(Adjacent, [robot, from_obj3]),
+        #     LiftedAtom(AdjacentNotFacing, [robot, from_obj3])
+        # }
+        # ignore_effects = set()
+        # move_when_not_facing_start_nsrt = NSRT(
+        #     "MoveWhenNotFacingStart",
+        #     parameters,
+        #     preconditions,
+        #     add_effects,
+        #     delete_effects,
+        #     ignore_effects,
+        #     option,
+        #     option_vars,
+        #     null_sampler
+        # )
+        # nsrts.add(move_when_not_facing_start_nsrt)
+
+        # NOTE: this nsrt will be relevant after the environment is updated to
+        # have more variation in the tasks' initial states.
+        # # PickMultipleAdjacent
+        # parameters = [robot, item]
+        # option_vars = [robot, item]
+        # option = Pick
+        # preconditions = {
+        #     LiftedAtom(HandEmpty, [robot]),
+        #     LiftedAtom(Adjacent, [robot, item]),
+        #     LiftedAtom(Facing, [robot, item]),
+        #     LiftedAtom(OnNothing, [item]),
+        #     LiftedAtom(Clear, [item])
+        # }
+        # add_effects = {
+        #     LiftedAtom(Holding, [robot, item])
+        # }
+        # delete_effects = {
+        #     LiftedAtom(HandEmpty, [robot]),
+        #     LiftedAtom(Adjacent, [robot, item]),
+        #     LiftedAtom(Facing, [robot, item])
+        # }
+        # ignore_effects: Set[Predicate] = set()
+        # pick_multiple_adjacent_nsrt = NSRT(
+        #     "PickMultipleAdjacent",
+        #     parameters,
+        #     preconditions,
+        #     add_effects,
+        #     delete_effects,
+        #     ignore_effects,
+        #     option,
+        #     option_vars,
+        #     null_sampler
+        # )
+        # nsrts.add(pick_multiple_adjacent_nsrt)
+
+        # PickSingleAdjacent
+        parameters = [robot, item]
+        option_vars = [robot, item]
+        option = Pick
+        preconditions = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(Adjacent, [robot, item]),
+            LiftedAtom(Facing, [robot, item]),
+            LiftedAtom(OnNothing, [item]),
+            LiftedAtom(Clear, [item])
+        }
+        add_effects = {
+            LiftedAtom(Holding, [robot, item]),
+            LiftedAtom(AdjacentToNothing, [robot])
+        }
+        delete_effects = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(Adjacent, [robot, item]),
+            LiftedAtom(Facing, [robot, item])
+        }
+        ignore_effects: Set[Predicate] = set()  # type: ignore[no-redef]
+        pick_single_adjacent_nsrt = NSRT("PickSingleAdjacent", parameters,
+                                         preconditions, add_effects,
+                                         delete_effects, ignore_effects,
+                                         option, option_vars, null_sampler)
+        nsrts.add(pick_single_adjacent_nsrt)
+
+        # PickFromStack
+        parameters = [robot, item, obj]
+        option_vars = [robot, item]
+        option = Pick
+        preconditions = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(Adjacent, [robot, item]),
+            LiftedAtom(Facing, [robot, item]),
+            LiftedAtom(Adjacent, [robot, obj]),
+            LiftedAtom(Facing, [robot, obj]),
+            LiftedAtom(On, [item, obj]),
+            LiftedAtom(Clear, [item])
+        }
+        add_effects = {
+            LiftedAtom(Holding, [robot, item]),
+            LiftedAtom(Clear, [obj])
+        }
+        delete_effects = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(Adjacent, [robot, item]),
+            LiftedAtom(Facing, [robot, item]),
+            LiftedAtom(On, [item, obj]),
+        }
+        ignore_effects: Set[Predicate] = set()  # type: ignore[no-redef]
+        pick_from_stack_nsrt = NSRT("PickFromStack", parameters, preconditions,
+                                    add_effects, delete_effects,
+                                    ignore_effects, option, option_vars,
+                                    null_sampler)
+        nsrts.add(pick_from_stack_nsrt)
+
+        # Place
+        parameters = [robot, item, obj]
+        option_vars = [robot, item, obj]
+        option = Place
+        preconditions = {
+            LiftedAtom(Holding, [robot, item]),
+            LiftedAtom(Adjacent, [robot, obj]),
+            LiftedAtom(Facing, [robot, obj]),
+            LiftedAtom(Clear, [obj])
+        }
+        add_effects = {
+            LiftedAtom(HandEmpty, [robot]),
+            LiftedAtom(On, [item, obj]),
+            LiftedAtom(Adjacent, [robot, item]),
+            LiftedAtom(Facing, [robot, item])
+        }
+        delete_effects = {
+            LiftedAtom(Holding, [robot, item]),
+            LiftedAtom(OnNothing, [item]),
+            LiftedAtom(Clear, [obj])
+        }
+        ignore_effects: Set[Predicate] = set()  # type: ignore[no-redef]
+        place_nsrt = NSRT("Place", parameters, preconditions, add_effects,
+                          delete_effects, ignore_effects, option, option_vars,
+                          null_sampler)
+        nsrts.add(place_nsrt)
+
+        return nsrts
diff --git a/predicators/ground_truth_models/burger/options.py b/predicators/ground_truth_models/burger/options.py
new file mode 100644
index 0000000000..2a3febe6f2
--- /dev/null
+++ b/predicators/ground_truth_models/burger/options.py
@@ -0,0 +1,233 @@
+"""Ground-truth options for the burger environment."""
+
+from typing import Dict, Iterator, Sequence, Set, Tuple
+
+import numpy as np
+from gym.spaces import Box
+
+from predicators import utils
+from predicators.envs.burger import BurgerEnv
+from predicators.ground_truth_models import GroundTruthOptionFactory
+from predicators.structs import Action, Array, Object, ParameterizedOption, \
+    ParameterizedPolicy, Predicate, State, Type
+
+
+class BurgerGroundTruthOptionFactory(GroundTruthOptionFactory):
+    """Ground-truth options for the burger environment."""
+
+    @classmethod
+    def get_env_names(cls) -> Set[str]:
+        return {"burger"}  # the single env name this factory reports
+
+    @classmethod
+    def get_options(cls, env_name: str, types: Dict[str, Type],
+                    predicates: Dict[str, Predicate],
+                    action_space: Box) -> Set[ParameterizedOption]:
+        """Build the burger options: Move, Pick, Place, Cook and Slice.
+
+        Each option has an empty parameter space, is always initiable,
+        and ends once its terminal predicate(s) hold in the current
+        state. Pick and Place are built from the same policy factory.
+        The env_name and action_space arguments are not used.
+
+        Returns the set of the five ParameterizedOptions.
+        """
+        # Look up the types and predicates that the options rely on.
+        tomato_type, patty_type = types["tomato"], types["patty"]
+        grill_type, board_type = types["grill"], types["cutting_board"]
+        robot_type, item_type = types["robot"], types["item"]
+        object_type = types["object"]
+        Facing = predicates["Facing"]
+        IsCooked, IsSliced = predicates["IsCooked"], predicates["IsSliced"]
+        HandEmpty, Holding = predicates["HandEmpty"], predicates["Holding"]
+        On = predicates["On"]
+
+        def _always_initiable(state: State, memory: Dict,
+                              objects: Sequence[Object],
+                              params: Array) -> bool:
+            del state, memory, objects, params  # unused
+            return True
+
+        # Slice ends once the tomato is sliced.
+        def _slice_done(state: State, memory: Dict,
+                        objects: Sequence[Object], params: Array) -> bool:
+            del memory, params  # unused
+            _robot, tomato, _board = objects
+            return IsSliced.holds(state, [tomato])
+
+        Slice = ParameterizedOption(
+            "Slice",
+            types=[robot_type, tomato_type, board_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_slice_policy(),
+            initiable=_always_initiable,
+            terminal=_slice_done)
+
+        # Cook ends once the patty is cooked.
+        def _cook_done(state: State, memory: Dict,
+                       objects: Sequence[Object], params: Array) -> bool:
+            del memory, params  # unused
+            _robot, patty, _grill = objects
+            return IsCooked.holds(state, [patty])
+
+        Cook = ParameterizedOption(
+            "Cook",
+            types=[robot_type, patty_type, grill_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_cook_policy(),
+            initiable=_always_initiable,
+            terminal=_cook_done)
+
+        # Move ends once the robot faces the target object.
+        def _move_done(state: State, memory: Dict,
+                       objects: Sequence[Object], params: Array) -> bool:
+            del memory, params  # unused
+            robot, target = objects
+            return Facing.holds(state, [robot, target])
+
+        Move = ParameterizedOption(
+            "Move",
+            types=[robot_type, object_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_move_policy(),
+            initiable=_always_initiable,
+            terminal=_move_done)
+
+        # Pick ends once the robot holds the item.
+        def _pick_done(state: State, memory: Dict,
+                       objects: Sequence[Object], params: Array) -> bool:
+            del memory, params  # unused
+            robot, item = objects
+            return Holding.holds(state, [robot, item])
+
+        Pick = ParameterizedOption(
+            "Pick",
+            types=[robot_type, item_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_pickplace_policy(),
+            initiable=_always_initiable,
+            terminal=_pick_done)
+
+        # Place ends once the hand is empty and the item is on the object.
+        def _place_done(state: State, memory: Dict,
+                        objects: Sequence[Object], params: Array) -> bool:
+            del memory, params  # unused
+            robot, item, obj = objects
+            hand_free = HandEmpty.holds(state, [robot])
+            return hand_free and On.holds(state, [item, obj])
+
+        Place = ParameterizedOption(
+            "Place",
+            types=[robot_type, item_type, object_type],
+            params_space=Box(0, 1, (0, )),
+            policy=cls._create_pickplace_policy(),
+            initiable=_always_initiable,
+            terminal=_place_done)
+
+        return {Move, Pick, Place, Cook, Slice}
+
+    @classmethod
+    def _create_slice_policy(cls) -> ParameterizedPolicy:
+        """Return the Slice policy, which always emits [0, 0, -1, 1, 0]."""
+
+        def _slice(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del state, memory, objects, params  # the action never varies
+            return Action(np.array([0, 0, -1, 1, 0], dtype=np.float32))
+
+        return _slice
+
+    @classmethod
+    def _create_cook_policy(cls) -> ParameterizedPolicy:
+        """Return the Cook policy, which always emits [0, 0, -1, 1, 0]."""
+
+        def _cook(state: State, memory: Dict, objects: Sequence[Object],
+                  params: Array) -> Action:
+            del state, memory, objects, params  # the action never varies
+            return Action(np.array([0, 0, -1, 1, 0], dtype=np.float32))
+
+        return _cook
+
+    @classmethod
+    def _create_move_policy(cls) -> ParameterizedPolicy:
+        """Return the Move policy for a (robot, to_obj) pair.
+
+        The policy turns the robot toward an adjacent target; otherwise it
+        takes one A* step over empty cells toward a cell adjacent to it.
+        It raises ValueError if no turn direction exists and IndexError if
+        the planned path contains no step.
+        """
+
+        def policy(state: State, memory: Dict, objects: Sequence[Object],
+                   params: Array) -> Action:
+            del memory, params  # unused
+            robot, to_obj = objects
+            rx, ry = BurgerEnv.get_position(robot, state)
+            ox, oy = BurgerEnv.get_position(to_obj, state)
+
+            # If we're adjacent to the object but not facing it, turn to face
+            # it; the third action entry appears to encode the direction.
+            if BurgerEnv.Adjacent_holds(state, [robot, to_obj]) and \
+                not BurgerEnv.Facing_holds(state, [robot, to_obj]):
+                if rx == ox and ry > oy:
+                    turn = 2
+                elif rx == ox and ry < oy:
+                    turn = 0
+                elif ry == oy and rx > ox:
+                    turn = 1
+                elif ry == oy and rx < ox:
+                    turn = 3
+                else:
+                    raise ValueError(
+                        f"Cannot turn {robot} toward {to_obj}: they do not "
+                        "differ along exactly one axis.")
+                return Action(np.array([0, 0, turn, 0, 0], dtype=np.float32))
+
+            # Otherwise, plan a path to a cell adjacent to the object. Empty
+            # cells are gathered once: the state is fixed during the search.
+            init = BurgerEnv.get_position(robot, state)
+            empty_cells = tuple(BurgerEnv.get_empty_cells(state))
+
+            def _check_goal(s: Tuple[int, int]) -> bool:
+                sx, sy = s
+                return BurgerEnv.is_adjacent(sx, sy, ox, oy)
+
+            def _get_successors(s: Tuple[int, int]) -> \
+                Iterator[Tuple[None, Tuple[int, int], float]]:
+                sx, sy = s
+                for cell in empty_cells:
+                    cx, cy = cell
+                    if BurgerEnv.is_adjacent(sx, sy, cx, cy):
+                        yield (None, cell, 1.0)
+
+            def heuristic(s: Tuple[int, int]) -> float:
+                sx, sy = s
+                return abs(sx - ox) + abs(sy - oy)
+
+            path, _ = utils.run_astar(initial_state=init,
+                                      check_goal=_check_goal,
+                                      get_successors=_get_successors,
+                                      heuristic=heuristic)
+
+            # The path is a list of (x, y) tuples starting from the location
+            # of the robot, so a one-element path offers no step to take.
+            if len(path) < 2:
+                raise IndexError(
+                    f"Move found no step from {init} toward {to_obj}.")
+            nx, ny = path[1]
+            dx = np.clip(nx - rx, -1, 1)
+            dy = np.clip(ny - ry, -1, 1)
+            return Action(np.array([dx, dy, -1, 0, 0], dtype=np.float32))
+
+        return policy
+
+    @classmethod
+    def _create_pickplace_policy(cls) -> ParameterizedPolicy:
+        """Return the pick/place policy; it always emits [0, 0, -1, 0, 1]."""
+
+        def _pickplace(state: State, memory: Dict, objects: Sequence[Object],
+                       params: Array) -> Action:
+            del state, memory, objects, params  # the action never varies
+            return Action(np.array([0, 0, -1, 0, 1], dtype=np.float32))
+
+        return _pickplace
diff --git a/predicators/ground_truth_models/ice_tea_making/__init__.py b/predicators/ground_truth_models/ice_tea_making/__init__.py
index f12182c3e2..763cb4486f 100644
--- a/predicators/ground_truth_models/ice_tea_making/__init__.py
+++ b/predicators/ground_truth_models/ice_tea_making/__init__.py
@@ -1,4 +1,4 @@
-"""Ground-truth models for blocks environment and variants."""
+"""Ground-truth models for ice tea making environment."""
 
 from .nsrts import TeaMakingGroundTruthNSRTFactory
 from .options import TeaMakingGroundTruthOptionFactory
diff --git a/predicators/ground_truth_models/ice_tea_making/nsrts.py b/predicators/ground_truth_models/ice_tea_making/nsrts.py
index ee0f6a60fa..9adf35195a 100644
--- a/predicators/ground_truth_models/ice_tea_making/nsrts.py
+++ b/predicators/ground_truth_models/ice_tea_making/nsrts.py
@@ -1,4 +1,4 @@
-"""Ground-truth NSRTs for the blocks environment."""
+"""Ground-truth NSRTs for the ice tea making environment."""
 
 from typing import Dict, Set
 
@@ -7,10 +7,10 @@
 
 
 class TeaMakingGroundTruthNSRTFactory(GroundTruthNSRTFactory):
-    """Ground-truth NSRTs for the apple_coring environment."""
+    """Ground-truth NSRTs for the ice tea making environment."""
 
     @classmethod
-    def get_env_names(cls) -> Set[str]:
+    def get_env_names(cls) -> Set[str]:  # pragma: no cover
         return {"ice_tea_making"}
 
     @staticmethod
diff --git a/predicators/ground_truth_models/ice_tea_making/options.py b/predicators/ground_truth_models/ice_tea_making/options.py
index 69c65b8dda..8a845949eb 100644
--- a/predicators/ground_truth_models/ice_tea_making/options.py
+++ b/predicators/ground_truth_models/ice_tea_making/options.py
@@ -1,4 +1,4 @@
-"""Ground-truth options for the (non-pybullet) blocks environment."""
+"""Ground-truth options for the ice tea making environment."""
 
 from typing import Dict, Sequence, Set
 
diff --git a/predicators/ground_truth_models/kitchen/nsrts.py b/predicators/ground_truth_models/kitchen/nsrts.py
index 55c4c230fc..397897e21b 100644
--- a/predicators/ground_truth_models/kitchen/nsrts.py
+++ b/predicators/ground_truth_models/kitchen/nsrts.py
@@ -36,7 +36,8 @@ def get_nsrts(env_name: str, types: Dict[str, Type],
         gripper = Variable("?gripper", gripper_type)
         on_off_obj = Variable("?on_off_obj", on_off_type)
         kettle = Variable("?kettle", kettle_type)
-        surface = Variable("?surface", surface_type)
+        surface_from = Variable("?surface_from", surface_type)
+        surface_to = Variable("?surface_to", surface_type)
         switch = Variable("?switch", switch_type)
         knob = Variable("?knob", knob_type)
         hinge_door = Variable("?hinge_door", hinge_door_type)
@@ -66,6 +67,10 @@ def get_nsrts(env_name: str, types: Dict[str, Type],
         OnTop = predicates["OnTop"]
         Open = predicates["Open"]
         NotOnTop = predicates["NotOnTop"]
+        BurnerAhead = predicates["BurnerAhead"]
+        BurnerBehdind = predicates["BurnerBehind"]
+        KettleBoiling = predicates["KettleBoiling"]
+        KnobAndBurnerLinked = predicates["KnobAndBurnerLinked"]
 
         nsrts = set()
 
@@ -182,18 +187,20 @@ def moveto_prepullkettle_sampler(state: State, goal: Set[GroundAtom],
         nsrts.add(move_to_pre_pull_kettle_nsrt)
 
         # PushObjOnObjForward
-        parameters = [gripper, kettle, surface]
+        parameters = [gripper, kettle, surface_from, surface_to]
         preconditions = {
             LiftedAtom(AtPrePushOnTop, [gripper, kettle]),
-            LiftedAtom(NotOnTop, [kettle, surface])
+            LiftedAtom(NotOnTop, [kettle, surface_to]),
+            LiftedAtom(BurnerAhead, [surface_to, surface_from]),
+            LiftedAtom(OnTop, [kettle, surface_from]),
         }
-        add_effects = {LiftedAtom(OnTop, [kettle, surface])}
-        delete_effects = {LiftedAtom(NotOnTop, [kettle, surface])}
+        add_effects = {LiftedAtom(OnTop, [kettle, surface_to])}
+        delete_effects = {LiftedAtom(NotOnTop, [kettle, surface_to])}
         ignore_effects = {
             AtPreTurnOn, AtPrePushOnTop, AtPreTurnOff, AtPrePullKettle
         }
         option = PushObjOnObjForward
-        option_vars = [gripper, kettle, surface]
+        option_vars = [gripper, kettle, surface_to]
 
         def push_obj_on_obj_forward_sampler(state: State,
                                             goal: Set[GroundAtom],
@@ -214,19 +221,49 @@ def push_obj_on_obj_forward_sampler(state: State,
                                             push_obj_on_obj_forward_sampler)
         nsrts.add(push_obj_on_obj_forward_nsrt)
 
+        # PushObjOnObjForwardAndBoilKettle
+        parameters = [gripper, kettle, surface_from, surface_to, knob]
+        preconditions = {
+            LiftedAtom(AtPrePushOnTop, [gripper, kettle]),
+            LiftedAtom(NotOnTop, [kettle, surface_to]),
+            LiftedAtom(BurnerAhead, [surface_to, surface_from]),
+            LiftedAtom(OnTop, [kettle, surface_from]),
+            LiftedAtom(TurnedOn, [knob]),
+            LiftedAtom(KnobAndBurnerLinked, [knob, surface_to])
+        }
+        add_effects = {
+            LiftedAtom(OnTop, [kettle, surface_to]),
+            LiftedAtom(KettleBoiling, [kettle, surface_to, knob])
+        }
+        delete_effects = {LiftedAtom(NotOnTop, [kettle, surface_to])}
+        ignore_effects = {
+            AtPreTurnOn, AtPrePushOnTop, AtPreTurnOff, AtPrePullKettle
+        }
+        option = PushObjOnObjForward
+        option_vars = [gripper, kettle, surface_to]
+        push_obj_on_obj_forward_nsrt = NSRT("PushObjOnObjForwardAndBoilKettle",
+                                            parameters, preconditions,
+                                            add_effects, delete_effects,
+                                            ignore_effects, option,
+                                            option_vars,
+                                            push_obj_on_obj_forward_sampler)
+        nsrts.add(push_obj_on_obj_forward_nsrt)
+
         # PullKettle
-        parameters = [gripper, kettle, surface]
+        parameters = [gripper, kettle, surface_from, surface_to]
         preconditions = {
             LiftedAtom(AtPrePullKettle, [gripper, kettle]),
-            LiftedAtom(NotOnTop, [kettle, surface])
+            LiftedAtom(NotOnTop, [kettle, surface_to]),
+            LiftedAtom(BurnerBehdind, [surface_to, surface_from]),
+            LiftedAtom(OnTop, [kettle, surface_from]),
         }
-        add_effects = {LiftedAtom(OnTop, [kettle, surface])}
+        add_effects = {LiftedAtom(OnTop, [kettle, surface_to])}
         delete_effects = {LiftedAtom(AtPrePullKettle, [gripper, kettle])}
         ignore_effects = {
             AtPreTurnOn, AtPrePushOnTop, AtPreTurnOff, AtPrePullKettle
         }
         option = PullKettle
-        option_vars = [gripper, kettle, surface]
+        option_vars = [gripper, kettle, surface_to]
 
         def pull_kettle_sampler(state: State, goal: Set[GroundAtom],
                                 rng: np.random.Generator,
@@ -328,6 +365,30 @@ def knob_turn_on_sampler(state: State, goal: Set[GroundAtom],
                                  option, option_vars, knob_turn_on_sampler)
         nsrts.add(turn_on_knob_nsrt)
 
+        # TurnOnKnobAndBoilKettle
+        parameters = [gripper, knob, surface_to, kettle]
+        preconditions = {
+            LiftedAtom(AtPreTurnOn, [gripper, knob]),
+            LiftedAtom(TurnedOff, [knob]),
+            LiftedAtom(OnTop, [kettle, surface_to]),
+            LiftedAtom(KnobAndBurnerLinked, [knob, surface_to])
+        }
+        add_effects = {
+            LiftedAtom(TurnedOn, [knob]),
+            LiftedAtom(KettleBoiling, [kettle, surface_to, knob])
+        }
+        delete_effects = {LiftedAtom(TurnedOff, [knob])}
+        ignore_effects = {
+            AtPreTurnOn, AtPrePushOnTop, AtPreTurnOff, AtPrePullKettle
+        }
+        option = TurnOnKnob
+        option_vars = [gripper, knob]
+        turn_on_knob_nsrt = NSRT("TurnOnKnobAndBoilKettle", parameters,
+                                 preconditions, add_effects, delete_effects,
+                                 ignore_effects, option, option_vars,
+                                 knob_turn_on_sampler)
+        nsrts.add(turn_on_knob_nsrt)
+
         # TurnOffKnob
         parameters = [gripper, knob]
         preconditions = {
diff --git a/predicators/ground_truth_models/kitchen/options.py b/predicators/ground_truth_models/kitchen/options.py
index fadddfb435..d1175f559c 100644
--- a/predicators/ground_truth_models/kitchen/options.py
+++ b/predicators/ground_truth_models/kitchen/options.py
@@ -250,11 +250,16 @@ def _PushObjOnObjForward_terminal(state: State, memory: Dict,
             if gripper_y - obj_y > 2 * cls.moveto_tol or \
                gripper_y - obj2_y > 2 * cls.moveto_tol:
                 return True
-            if not GroundAtom(OnTop, [obj, obj2]).holds(state):
-                return False
-            # Stronger check to deal with case where push release leads object
-            # to be no longer OnTop.
-            return obj_y > obj2_y - cls.moveto_tol / 4.0
+            # NOTE: this stronger check was necessary at some point to deal
+            # with a subtle case where this action pushes the kettle off
+            # the burner when it ends. However, this stronger check often
+            # doesn't terminate when the goal is set to pushing the kettle
+            # onto a particular burner. So now, we just terminate
+            # when the action's symbolic effects hold; we might have to
+            # reinstate/incorporate this stronger check later if the issue
+            # starts cropping up again.
+            # return obj_y > obj2_y - cls.moveto_tol / 4.0
+            return GroundAtom(OnTop, [obj, obj2]).holds(state)
 
         PushObjOnObjForward = ParameterizedOption(
             "PushObjOnObjForward",
diff --git a/predicators/main.py b/predicators/main.py
index 06e7226a81..e411d3a698 100644
--- a/predicators/main.py
+++ b/predicators/main.py
@@ -40,15 +40,14 @@
 import time
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Optional, Sequence, Set, Tuple
-from typing import Type as TypingType
+from typing import List, Optional, Sequence, Tuple
 
 import dill as pkl
 
 from predicators import utils
 from predicators.approaches import ApproachFailure, ApproachTimeout, \
     create_approach
-from predicators.cogman import CogMan
+from predicators.cogman import CogMan, run_episode_and_get_observations
 from predicators.datasets import create_dataset
 from predicators.envs import BaseEnv, create_new_env
 from predicators.execution_monitoring import create_execution_monitor
@@ -56,8 +55,8 @@
     parse_config_included_options
 from predicators.perception import create_perceiver
 from predicators.settings import CFG, get_allowed_query_type_names
-from predicators.structs import Action, Dataset, InteractionRequest, \
-    InteractionResult, Metrics, Observation, Response, Task, Video, _Option
+from predicators.structs import Dataset, InteractionRequest, \
+    InteractionResult, Metrics, Response, Task, Video
 from predicators.teacher import Teacher, TeacherInteractionMonitorWithVideo
 
 assert os.environ.get("PYTHONHASHSEED") == "0", \
@@ -133,7 +132,7 @@ def main() -> None:
         # Create the offline dataset. Note that this needs to be done using
         # the non-stripped train tasks because dataset generation may need
         # to use the oracle predicates (e.g. demo data generation).
-        offline_dataset = create_dataset(env, train_tasks, options)
+        offline_dataset = create_dataset(env, train_tasks, options, preds)
     else:
         offline_dataset = None
     # Create the cognitive manager.
@@ -278,7 +277,7 @@ def _generate_interaction_results(
         cogman.set_termination_function(request.termination_function)
         env_task = env.get_train_tasks()[request.train_task_idx]
         cogman.reset(env_task)
-        observed_traj, _, _ = _run_episode(
+        observed_traj, _, _ = run_episode_and_get_observations(
             cogman,
             env,
             "train",
@@ -334,9 +333,10 @@ def _run_testing(env: BaseEnv, cogman: CogMan) -> Metrics:
     for test_task_idx, env_task in enumerate(test_tasks):
         solve_start = time.perf_counter()
         try:
-            # We call reset here, outside of run_episode, so that we can log
-            # planning failures, timeouts, etc. This is mostly for legacy
-            # reasons (before cogman existed separately from approaches).
+            # We call reset here, outside of run_episode_and_get_observations,
+            # so that we can log planning failures, timeouts, etc. This is
+            # mostly for legacy reasons (before cogman existed separately
+            # from approaches).
             cogman.reset(env_task)
         except (ApproachTimeout, ApproachFailure) as e:
             logging.info(f"Task {test_task_idx+1} / {len(test_tasks)}: "
@@ -375,7 +375,7 @@ def _run_testing(env: BaseEnv, cogman: CogMan) -> Metrics:
             monitor = None
         try:
             # Now, measure success by running the policy in the environment.
-            traj, solved, execution_metrics = _run_episode(
+            traj, solved, execution_metrics = run_episode_and_get_observations(
                 cogman,
                 env,
                 "test",
@@ -466,91 +466,6 @@ def _run_testing(env: BaseEnv, cogman: CogMan) -> Metrics:
     return metrics
 
 
-def _run_episode(
-    cogman: CogMan,
-    env: BaseEnv,
-    train_or_test: str,
-    task_idx: int,
-    max_num_steps: int,
-    do_env_reset: bool = True,
-    terminate_on_goal_reached: bool = True,
-    exceptions_to_break_on: Optional[Set[TypingType[Exception]]] = None,
-    monitor: Optional[utils.LoggingMonitor] = None
-) -> Tuple[Tuple[List[Observation], List[Action]], bool, Metrics]:
-    """Execute cogman starting from the initial state of a train or test task
-    in the environment.
-
-    Note that the environment and cogman internal states are updated.
-
-    Terminates when any of these conditions hold:
-    (1) cogman.step returns None, indicating termination
-    (2) max_num_steps is reached
-    (3) cogman or env raise an exception of type in exceptions_to_break_on
-    (4) terminate_on_goal_reached is True and the env goal is reached.
-
-    Note that in the case where the exception is raised in step, we exclude the
-    last action from the returned trajectory to maintain the invariant that
-    the trajectory states are of length one greater than the actions.
-
-    This is defined here mostly to avoid circular import issues for cogman.
-    We may want to move it eventually.
-    """
-    if do_env_reset:
-        env.reset(train_or_test, task_idx)
-        if monitor is not None:
-            monitor.reset(train_or_test, task_idx)
-    obs = env.get_observation()
-    observations = [obs]
-    actions: List[Action] = []
-    curr_option: Optional[_Option] = None
-    metrics: Metrics = defaultdict(float)
-    metrics["policy_call_time"] = 0.0
-    metrics["num_options_executed"] = 0.0
-    exception_raised_in_step = False
-    if not (terminate_on_goal_reached and env.goal_reached()):
-        for _ in range(max_num_steps):
-            monitor_observed = False
-            exception_raised_in_step = False
-            try:
-                start_time = time.perf_counter()
-                act = cogman.step(obs)
-                metrics["policy_call_time"] += time.perf_counter() - start_time
-                if act is None:
-                    break
-                if act.has_option() and act.get_option() != curr_option:
-                    curr_option = act.get_option()
-                    metrics["num_options_executed"] += 1
-                # Note: it's important to call monitor.observe() before
-                # env.step(), because the monitor may, for example, call
-                # env.render(), which outputs images of the current env
-                # state. If we instead called env.step() first, we would
-                # mistakenly record images of the next time step instead of
-                # the current one.
-                if monitor is not None:
-                    monitor.observe(obs, act)
-                    monitor_observed = True
-                obs = env.step(act)
-                actions.append(act)
-                observations.append(obs)
-            except Exception as e:
-                if exceptions_to_break_on is not None and \
-                   any(issubclass(type(e), c) for c in exceptions_to_break_on):
-                    if monitor_observed:
-                        exception_raised_in_step = True
-                    break
-                if monitor is not None and not monitor_observed:
-                    monitor.observe(obs, None)
-                raise e
-            if terminate_on_goal_reached and env.goal_reached():
-                break
-    if monitor is not None and not exception_raised_in_step:
-        monitor.observe(obs, None)
-    cogman.finish_episode(obs)
-    traj = (observations, actions)
-    solved = env.goal_reached()
-    return traj, solved, metrics
-
-
 def _save_test_results(results: Metrics,
                        online_learning_cycle: Optional[int]) -> None:
     num_solved = results["num_solved"]
diff --git a/predicators/nsrt_learning/sampler_learning.py b/predicators/nsrt_learning/sampler_learning.py
index 35768006cb..f11a7be754 100644
--- a/predicators/nsrt_learning/sampler_learning.py
+++ b/predicators/nsrt_learning/sampler_learning.py
@@ -15,7 +15,8 @@
 from predicators.settings import CFG
 from predicators.structs import NSRT, Array, Datastore, EntToEntSub, \
     GroundAtom, LiftedAtom, NSRTSampler, Object, OptionSpec, \
-    ParameterizedOption, SamplerDatapoint, State, STRIPSOperator, Variable
+    ParameterizedOption, SamplerDatapoint, State, STRIPSOperator, Variable, \
+    VLMPredicate
 
 
 def learn_samplers(strips_ops: List[STRIPSOperator],
@@ -246,9 +247,12 @@ def _create_sampler_data(
             goal = segment.get_goal()
         else:
             goal = None
+        # We omit VLMPredicates from the check below because it's too
+        # expensive to evaluate them here as well.
         assert all(
             pre.predicate.holds(state, [var_to_obj[v] for v in pre.variables])
-            for pre in preconditions)
+            for pre in preconditions
+            if not isinstance(pre.predicate, VLMPredicate))
         positive_data.append((state, var_to_obj, option, goal))
 
     # Populate all negative data.
diff --git a/predicators/nsrt_learning/strips_learning/clustering_learner.py b/predicators/nsrt_learning/strips_learning/clustering_learner.py
index 3a331c8353..fb0eae8dbe 100644
--- a/predicators/nsrt_learning/strips_learning/clustering_learner.py
+++ b/predicators/nsrt_learning/strips_learning/clustering_learner.py
@@ -3,13 +3,14 @@
 import abc
 import functools
 import logging
+from collections import defaultdict
 from typing import Dict, FrozenSet, Iterator, List, Set, Tuple, cast
 
 from predicators import utils
 from predicators.nsrt_learning.strips_learning import BaseSTRIPSLearner
 from predicators.settings import CFG
 from predicators.structs import PNAD, Datastore, DummyOption, LiftedAtom, \
-    Predicate, STRIPSOperator, VarToObjSub
+    ParameterizedOption, Predicate, STRIPSOperator, VarToObjSub
 
 
 class ClusteringSTRIPSLearner(BaseSTRIPSLearner):
@@ -127,6 +128,29 @@ def _learn_pnad_preconditions(self, pnads: List[PNAD]) -> List[PNAD]:
     def get_name(cls) -> str:
         return "cluster_and_intersect"
 
+    def _postprocessing_learn_ignore_effects(self,
+                                             pnads: List[PNAD]) -> List[PNAD]:
+        """Prune PNADs whose datastores are too small.
+
+        Specifically, keep PNADs that have at least
+        CFG.cluster_and_intersect_min_datastore_fraction fraction of the
+        segments produced by the option in their NSRT.
+        """
+        if not CFG.cluster_and_intersect_prune_low_data_pnads:
+            return pnads
+        option_to_dataset_size: Dict[ParameterizedOption,
+                                     int] = defaultdict(int)
+        for pnad in pnads:
+            option = pnad.option_spec[0]
+            option_to_dataset_size[option] += len(pnad.datastore)
+        ret_pnads: List[PNAD] = []
+        for pnad in pnads:
+            option = pnad.option_spec[0]
+            fraction = len(pnad.datastore) / option_to_dataset_size[option]
+            if fraction >= CFG.cluster_and_intersect_min_datastore_fraction:
+                ret_pnads.append(pnad)
+        return ret_pnads
+
 
 class ClusterAndSearchSTRIPSLearner(ClusteringSTRIPSLearner):
     """A clustering STRIPS learner that learns preconditions via search,
diff --git a/predicators/perception/kitchen_perceiver.py b/predicators/perception/kitchen_perceiver.py
index 0aa179b90b..6637ceaf1f 100644
--- a/predicators/perception/kitchen_perceiver.py
+++ b/predicators/perception/kitchen_perceiver.py
@@ -18,6 +18,7 @@ def reset(self, env_task: EnvironmentTask) -> Task:
         pred_name_to_pred = KitchenEnv.create_predicates()
         OnTop = pred_name_to_pred["OnTop"]
         TurnedOn = pred_name_to_pred["TurnedOn"]
+        KettleBoiling = pred_name_to_pred["KettleBoiling"]
         kettle = KitchenEnv.object_name_to_object("kettle")
         knob4 = KitchenEnv.object_name_to_object("knob4")
         burner4 = KitchenEnv.object_name_to_object("burner4")
@@ -40,6 +41,8 @@ def reset(self, env_task: EnvironmentTask) -> Task:
             goal = {
                 GroundAtom(TurnedOn, [light]),
             }
+        elif goal_desc == "Move the kettle to the back burner and turn it on":
+            goal = {GroundAtom(KettleBoiling, [kettle, burner4, knob4])}
         else:
             raise NotImplementedError(f"Unrecognized goal: {goal_desc}")
         return Task(state, goal)
@@ -48,7 +51,9 @@ def step(self, observation: Observation) -> State:
         return self._observation_to_state(observation)
 
     def _observation_to_state(self, obs: Observation) -> State:
-        return KitchenEnv.state_info_to_state(obs["state_info"])
+        state = KitchenEnv.state_info_to_state(obs["state_info"])
+        state.simulator_state = obs["obs_images"]
+        return state
 
     def render_mental_images(self, observation: Observation,
                              env_task: EnvironmentTask) -> Video:
diff --git a/predicators/predicate_search_score_functions.py b/predicators/predicate_search_score_functions.py
index 709020345c..c4f8a24547 100644
--- a/predicators/predicate_search_score_functions.py
+++ b/predicators/predicate_search_score_functions.py
@@ -155,6 +155,13 @@ def evaluate(self, candidate_predicates: FrozenSet[Predicate]) -> float:
             logging.info(
                 "Warning: Operator Learning timed out! Skipping evaluation.")
             return float('inf')
+
+        logging.debug(
+            f"Learned {len(pnads)} operators for this predicate set.")
+        for pnad in pnads:
+            logging.debug(
+                f"Operator {pnad.op.name} has {len(pnad.datastore)} datapoints."
+            )
         strips_ops = [pnad.op for pnad in pnads]
         option_specs = [pnad.option_spec for pnad in pnads]
         op_score = self.evaluate_with_operators(candidate_predicates,
diff --git a/predicators/pretrained_model_interface.py b/predicators/pretrained_model_interface.py
index 8bee546259..2fb5079835 100644
--- a/predicators/pretrained_model_interface.py
+++ b/predicators/pretrained_model_interface.py
@@ -5,16 +5,17 @@
 """
 
 import abc
+import base64
 import logging
 import os
-import time
-from typing import List, Optional
+from io import BytesIO
+from typing import Collection, Dict, List, Optional, Union
 
-import google
 import google.generativeai as genai
 import imagehash
 import openai
 import PIL.Image
+from tenacity import retry, stop_after_attempt, wait_random_exponential
 
 from predicators.settings import CFG
 
@@ -74,7 +75,7 @@ def sample_completions(self,
         model_id = self.get_id()
         prompt_id = hash(prompt)
         config_id = f"{temperature}_{seed}_{num_completions}_" + \
-                f"{stop_token}"
+                    f"{stop_token}"
         # If the temperature is 0, the seed does not matter.
         if temperature == 0.0:
             config_id = f"most_likely_{num_completions}_{stop_token}"
@@ -159,23 +160,56 @@ def sample_completions(
                                           stop_token, num_completions)
 
 
-class OpenAILLM(LargeLanguageModel):
-    """Interface to openAI LLMs (GPT-3).
+class OpenAIModel():
+    """Common interface with methods for all OpenAI-based models."""
+
+    def set_openai_key(self, key: Optional[str] = None) -> None:
+        """Set the OpenAI API key."""
+        if key is None:
+            assert "OPENAI_API_KEY" in os.environ
+            key = os.environ["OPENAI_API_KEY"]
+
+    @retry(wait=wait_random_exponential(min=1, max=60),
+           stop=stop_after_attempt(10))
+    def call_openai_api(self,
+                        messages: list,
+                        model: str = "gpt-4",
+                        seed: Optional[int] = None,
+                        max_tokens: int = 32,
+                        temperature: float = 0.2,
+                        verbose: bool = False) -> str:  # pragma: no cover
+        """Make an API call to OpenAI."""
+        client = openai.OpenAI()
+        completion = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            seed=seed,
+            max_tokens=max_tokens,
+            temperature=temperature,
+        )
+        if verbose:
+            logging.debug(f"OpenAI API response: {completion}")
+        assert len(completion.choices) == 1
+        assert completion.choices[0].message.content is not None
+        return completion.choices[0].message.content
+
+
+class OpenAILLM(LargeLanguageModel, OpenAIModel):
+    """Interface to OpenAI LLMs.
 
     Assumes that an environment variable OPENAI_API_KEY is set to a
     private API key for beta.openai.com.
     """
 
     def __init__(self, model_name: str) -> None:
-        """See https://beta.openai.com/docs/models/gpt-3 for the list of
+        """See https://platform.openai.com/docs/models for the list of
         available model names."""
         self._model_name = model_name
         # Note that max_tokens is the maximum response length (not prompt).
         # From OpenAI docs: "The token count of your prompt plus max_tokens
         # cannot exceed the model's context length."
         self._max_tokens = CFG.llm_openai_max_response_tokens
-        assert "OPENAI_API_KEY" in os.environ
-        openai.api_key = os.getenv("OPENAI_API_KEY")
+        self.set_openai_key()
 
     def get_id(self) -> str:
         return f"openai-{self._model_name}"
@@ -188,19 +222,15 @@ def _sample_completions(
             seed: int,
             stop_token: Optional[str] = None,
             num_completions: int = 1) -> List[str]:  # pragma: no cover
-        del imgs, seed  # unused
-        response = openai.Completion.create(
-            model=self._model_name,  # type: ignore
-            prompt=prompt,
-            temperature=temperature,
-            max_tokens=self._max_tokens,
-            stop=stop_token,
-            n=num_completions)
-        assert len(response["choices"]) == num_completions
-        text_responses = [
-            response["choices"][i]["text"] for i in range(num_completions)
+        del imgs, seed, stop_token  # unused
+        messages = [{"text": prompt, "type": "text"}]
+        responses = [
+            self.call_openai_api(messages,
+                                 model=self._model_name,
+                                 temperature=temperature)
+            for _ in range(num_completions)
         ]
-        return text_responses
+        return responses
 
 
 class GoogleGeminiVLM(VisionLanguageModel):
@@ -221,6 +251,8 @@ def __init__(self, model_name: str) -> None:
     def get_id(self) -> str:
         return f"Google-{self._model_name}"
 
+    @retry(wait=wait_random_exponential(min=1, max=60),
+           stop=stop_after_attempt(10))
     def _sample_completions(
             self,
             prompt: str,
@@ -234,18 +266,89 @@ def _sample_completions(
         generation_config = genai.types.GenerationConfig(  # pylint:disable=no-member
             candidate_count=num_completions,
             temperature=temperature)
-        response = None
-        while response is None:
-            try:
-                response = self._model.generate_content(
-                    [prompt] + imgs,
-                    generation_config=generation_config)  # type: ignore
-                break
-            except google.api_core.exceptions.ResourceExhausted:
-                # In this case, we've hit a rate limit. Simply wait 3s and
-                # try again.
-                logging.debug(
-                    "Hit rate limit for Gemini queries; trying again in 3s!")
-                time.sleep(3.0)
+        response = self._model.generate_content(
+            [prompt] + imgs,
+            generation_config=generation_config)  # type: ignore
         response.resolve()
         return [response.text]
+
+
+class OpenAIVLM(VisionLanguageModel, OpenAIModel):
+    """Interface for OpenAI's VLMs, including GPT-4 Turbo (and preview
+    versions)."""
+
+    def __init__(self, model_name: str):
+        """Initialize with a specific model name."""
+        self.model_name = model_name
+        # Note that max_tokens is the maximum response length (not prompt).
+        # From OpenAI docs: "The token count of your prompt plus max_tokens
+        # cannot exceed the model's context length."
+        self._max_tokens = CFG.llm_openai_max_response_tokens
+        self.set_openai_key()
+
+    def prepare_vision_messages(
+        self,
+        images: List[PIL.Image.Image],
+        prefix: Optional[str] = None,
+        suffix: Optional[str] = None,
+        image_size: Optional[int] = 512,
+        detail: str = "auto"
+    ) -> List[Dict[str, Union[str, List[Dict[str, str]], List[Dict[
+            str, Collection[str]]]]]]:
+        """Build one user message from prefix, base64 PNG images, suffix."""
+        content: List[Dict[str, Union[str, Collection[str]]]] = []
+        if prefix:
+            content.append({"text": prefix, "type": "text"})
+        assert images
+        assert detail in ["auto", "low", "high"]
+        for img in images:
+            img_resized = img
+            if image_size:
+                factor = image_size / max(img.size)
+                img_resized = img.resize(
+                    (int(img.size[0] * factor), int(img.size[1] * factor)))
+            # Convert the image to PNG format and encode it in base64
+            buffer = BytesIO()
+            img_resized.save(buffer, format="PNG")
+            frame = base64.b64encode(buffer.getvalue()).decode("utf-8")
+            content_str = {
+                "image_url": {
+                    "url": f"data:image/png;base64,{frame}",
+                    # Honor the caller's (validated) detail level.
+                    "detail": detail
+                },
+                "type": "image_url"
+            }
+            content.append(content_str)
+        if suffix:
+            content.append({"text": suffix, "type": "text"})
+        return [{"role": "user", "content": content}]
+
+    def get_id(self) -> str:
+        """Get an identifier for the model."""
+        return f"OpenAI-{self.model_name}"
+
+    def _sample_completions(
+        self,
+        prompt: str,
+        imgs: Optional[List[PIL.Image.Image]],
+        temperature: float,
+        seed: int,
+        stop_token: Optional[str] = None,
+        num_completions: int = 1,
+    ) -> List[str]:  # pragma: no cover
+        """Query the model and get responses."""
+        del seed, stop_token  # unused.
+        if imgs is None:
+            raise ValueError("images cannot be None")
+        messages = self.prepare_vision_messages(prefix=prompt,
+                                                images=imgs,
+                                                detail="auto")
+        responses = [
+            self.call_openai_api(messages,
+                                 model=self.model_name,
+                                 max_tokens=self._max_tokens,
+                                 temperature=temperature)
+            for _ in range(num_completions)
+        ]
+        return responses
diff --git a/predicators/settings.py b/predicators/settings.py
index 02348dace7..74f233e513 100644
--- a/predicators/settings.py
+++ b/predicators/settings.py
@@ -351,6 +351,10 @@ class GlobalSettings:
     # grid row env parameters
     grid_row_num_cells = 100
 
+    # burger env parameters
+    gridworld_num_rows = 4
+    gridworld_num_cols = 4
+
     # parameters for random options approach
     random_options_max_tries = 100
 
@@ -409,7 +413,8 @@ class GlobalSettings:
     override_json_with_input = False  # Only works with SpotEnv for now
 
     # parameters for vision language models
-    vlm_model_name = "gemini-pro-vision"  # "gemini-1.5-pro-latest"
+    # Other example values: gemini-1.5-pro-latest, gpt-4-turbo, gpt-4o
+    vlm_model_name = "gemini-pro-vision"
 
     # SeSamE parameters
     sesame_task_planner = "astar"  # "astar" or "fdopt" or "fdsat"
@@ -445,6 +450,7 @@ class GlobalSettings:
     approach_dir = "saved_approaches"
     data_dir = "saved_datasets"
     video_dir = "videos"
+    image_dir = "images"
     video_fps = 2
     failure_video_mode = "longest_only"
 
@@ -486,6 +492,12 @@ class GlobalSettings:
     cluster_and_search_score_func_max_groundings = 10000
     cluster_and_search_var_count_weight = 0.1
     cluster_and_search_precon_size_weight = 0.01
+    cluster_and_intersect_prune_low_data_pnads = False
+    # If cluster_and_intersect_prune_low_data_pnads is set to True, PNADs must
+    # have at least this fraction of the segments produced by the option that is
+    # associated with their PNAD in order to not be pruned during operator
+    # learning.
+    cluster_and_intersect_min_datastore_fraction = 0.0
 
     # torch GPU usage setting
     use_torch_gpu = False
@@ -669,7 +681,7 @@ class GlobalSettings:
     # demo+labelled_atoms
     handmade_demo_filename = ""
     # filepath to be used if offline_data_method is set to
-    # img_demos
+    # saved_vlm_img_demos_folder
     vlm_trajs_folder_name = ""
 
     @classmethod
diff --git a/predicators/structs.py b/predicators/structs.py
index 4fd65ecf09..046ad188a4 100644
--- a/predicators/structs.py
+++ b/predicators/structs.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import abc
+import copy
 import itertools
 from dataclasses import dataclass, field
 from functools import cached_property, lru_cache
@@ -162,7 +163,8 @@ def copy(self) -> State:
         new_data = {}
         for obj in self:
             new_data[obj] = self._copy_state_value(self.data[obj])
-        return State(new_data, simulator_state=self.simulator_state)
+        return State(new_data,
+                     simulator_state=copy.deepcopy(self.simulator_state))
 
     def _copy_state_value(self, val: Any) -> Any:
         if val is None or isinstance(val, (float, bool, int, str)):
@@ -214,7 +216,7 @@ def pretty_str(self) -> str:
 DefaultState = State({})
 
 
-@dataclass(frozen=True, order=True, repr=False)
+@dataclass(frozen=True, order=False, repr=False)
 class Predicate:
     """Struct defining a predicate (a lifted classifier over states)."""
     name: str
@@ -306,6 +308,21 @@ def _negated_classifier(self, state: State,
         # Separate this into a named function for pickling reasons.
         return not self._classifier(state, objects)
 
+    def __lt__(self, other: Predicate) -> bool:
+        return str(self) < str(other)
+
+
+@dataclass(frozen=True, order=False, repr=False, eq=False)
+class VLMPredicate(Predicate):
+    """Struct defining a predicate that calls a VLM as part of returning its
+    truth value.
+
+    NOTE: when instantiating a VLMPredicate, we typically pass in a 'Dummy'
+    classifier (i.e., one that simply raises some kind of error instead
+    of actually outputting a value of any kind).
+    """
+    get_vlm_query_str: Callable[[Sequence[Object]], str]
+
 
 @dataclass(frozen=True, repr=False, eq=False)
 class _Atom:
@@ -413,6 +430,12 @@ def holds(self, state: State) -> bool:
         """Check whether this ground atom holds in the given state."""
         return self.predicate.holds(state, self.objects)
 
+    def get_vlm_query_str(self) -> str:
+        """If this GroundAtom is associated with a VLMPredicate, then get the
+        string that will be used to query the VLM."""
+        assert isinstance(self.predicate, VLMPredicate)
+        return self.predicate.get_vlm_query_str(self.objects)  # pylint:disable=no-member
+
 
 @dataclass(frozen=True, eq=False)
 class Task:
@@ -1129,9 +1152,9 @@ def train_task_idx(self) -> int:
 
 @dataclass(frozen=True, repr=False, eq=False)
 class ImageOptionTrajectory:
-    """A structure similar to a LowLevelTrajectory where instead of low-level
-    states and actions, we record images at every state (i.e., observations),
-    as well as the option that was executed to get between observation images.
+    """A structure similar to a LowLevelTrajectory where we record images at
+    every state (i.e., observations), as well as the option that was executed
+    to get between observation images. States are optionally included too.
 
     Invariant 1: If this trajectory is a demonstration, it must contain
     a train task idx and achieve the goal in the respective train task.
@@ -1143,6 +1166,7 @@ class ImageOptionTrajectory:
     _objects: Collection[Object]
     _state_imgs: List[List[PIL.Image.Image]]
     _actions: List[_Option]
+    _states: Optional[List[State]] = field(default=None)
     _is_demo: bool = field(default=False)
     _train_task_idx: Optional[int] = field(default=None)
 
@@ -1150,10 +1174,12 @@ def __post_init__(self) -> None:
         assert len(self._state_imgs) == len(self._actions) + 1
         if self._is_demo:
             assert self._train_task_idx is not None
+        if self._states is not None:
+            assert len(self._states) == len(self._state_imgs)
 
     @property
     def imgs(self) -> List[List[PIL.Image.Image]]:
-        """States in the trajectory."""
+        """State images in the trajectory."""
         return self._state_imgs
 
     @property
@@ -1166,6 +1192,16 @@ def actions(self) -> List[_Option]:
         """Actions in the trajectory."""
         return self._actions
 
+    @property
+    def states(self) -> Optional[List[State]]:
+        """States in the trajectory, if they exist."""
+        return self._states
+
+    @property
+    def train_task_idx(self) -> Optional[int]:
+        """Returns the idx of the train task."""
+        return self._train_task_idx
+
 
 @dataclass(repr=False, eq=False)
 class Dataset:
diff --git a/predicators/utils.py b/predicators/utils.py
index 7293848f78..df811f2d49 100644
--- a/predicators/utils.py
+++ b/predicators/utils.py
@@ -43,6 +43,7 @@
 import numpy as np
 import pathos.multiprocessing as mp
 from bosdyn.client import math_helpers
+import PIL.Image
 from gym.spaces import Box
 from matplotlib import patches
 from numpy.typing import NDArray
@@ -52,6 +53,8 @@
 from scipy.stats import beta as BetaRV
 
 from predicators.args import create_arg_parser
+from predicators.pretrained_model_interface import GoogleGeminiVLM, \
+    OpenAIVLM, VisionLanguageModel
 from predicators.pybullet_helpers.joint import JointPositions
 from predicators.settings import CFG, GlobalSettings
 from predicators.structs import NSRT, Action, Array, DummyOption, \
@@ -60,8 +63,8 @@
     LiftedDecisionList, LiftedOrGroundAtom, LowLevelTrajectory, Metrics, \
     NSRTOrSTRIPSOperator, Object, ObjectOrVariable, Observation, OptionSpec, \
     ParameterizedOption, Predicate, Segment, SpotAction, SpotActionExtraInfo, \
-    State, STRIPSOperator, Task, Type, Variable, VarToObjSub, Video, \
-    _GroundLDLRule, _GroundNSRT, _GroundSTRIPSOperator, _Option, \
+    State, STRIPSOperator, Task, Type, Variable, VarToObjSub, VLMPredicate, \
+    Video, _GroundLDLRule, _GroundNSRT, _GroundSTRIPSOperator, _Option, \
     _TypedEntity
 from predicators.third_party.fast_downward_translator.translate import \
     main as downward_translate
@@ -1229,7 +1232,9 @@ def run_policy(
     last action from the returned trajectory to maintain the invariant that
     the trajectory states are of length one greater than the actions.
 
-    NOTE: this may be deprecated in the future in favor of run_episode.
+    NOTE: this may be deprecated in the future in favor of run_episode defined
+    in cogman.py. Ideally, we should consolidate both run_policy and
+    run_policy_with_simulator below into run_episode.
     """
     if do_env_reset:
         env.reset(train_or_test, task_idx)
@@ -2331,17 +2336,103 @@ def strip_task(task: Task, included_predicates: Set[Predicate]) -> Task:
     return Task(task.init, stripped_goal)
 
 
-def abstract(state: State, preds: Collection[Predicate]) -> Set[GroundAtom]:
+def create_vlm_predicate(
+        name: str, types: Sequence[Type],
+        get_vlm_query_str: Callable[[Sequence[Object]], str]) -> VLMPredicate:
+    """Simple function that creates VLMPredicates with dummy classifiers, which
+    is the most common way these need to be created."""
+
+    def _stripped_classifier(
+            state: State,
+            objects: Sequence[Object]) -> bool:  # pragma: no cover.
+        raise Exception("VLM predicate classifier should never be called!")
+
+    return VLMPredicate(name, types, _stripped_classifier, get_vlm_query_str)
+
+
+def create_vlm_by_name(
+        model_name: str) -> VisionLanguageModel:  # pragma: no cover
+    """Create a particular VLM using the provided name."""
+    if "gemini" in model_name:
+        return GoogleGeminiVLM(model_name)
+    return OpenAIVLM(model_name)
+
+
+def query_vlm_for_atom_vals(
+        vlm_atoms: Collection[GroundAtom],
+        state: State,
+        vlm: Optional[VisionLanguageModel] = None) -> Set[GroundAtom]:
+    """Given a set of ground atoms, queries a VLM and gets the subset of these
+    atoms that are true."""
+    true_atoms: Set[GroundAtom] = set()
+    # This only works if state.simulator_state is some list of images that the
+    # vlm can be called on.
+    assert state.simulator_state is not None
+    assert isinstance(state.simulator_state, List)
+    imgs = state.simulator_state
+    vlm_atoms = sorted(vlm_atoms)
+    atom_queries_str = "\n* "
+    atom_queries_str += "\n* ".join(atom.get_vlm_query_str()
+                                    for atom in vlm_atoms)
+    filepath_to_vlm_prompt = get_path_to_predicators_root() + \
+        "/predicators/datasets/vlm_input_data_prompts/atom_labelling/" + \
+        "per_scene_naive.txt"
+    with open(filepath_to_vlm_prompt, "r", encoding="utf-8") as f:
+        vlm_query_str = f.read()
+    vlm_query_str += atom_queries_str
+    if vlm is None:
+        vlm = create_vlm_by_name(CFG.vlm_model_name)  # pragma: no cover.
+    vlm_input_imgs = \
+        [PIL.Image.fromarray(img_arr) for img_arr in imgs] # type: ignore
+    vlm_output = vlm.sample_completions(vlm_query_str,
+                                        vlm_input_imgs,
+                                        0.0,
+                                        seed=CFG.seed,
+                                        num_completions=1)
+    assert len(vlm_output) == 1
+    vlm_output_str = vlm_output[0]
+    all_atom_queries = atom_queries_str.strip().split("\n")
+    all_vlm_responses = vlm_output_str.strip().split("\n")
+
+    # NOTE: this assumption is likely too brittle; if this is breaking, feel
+    # free to remove/adjust this and change the below parsing loop accordingly!
+    assert len(all_atom_queries) == len(all_vlm_responses)
+    for i, (atom_query, curr_vlm_output_line) in enumerate(
+            zip(all_atom_queries, all_vlm_responses)):
+        assert atom_query + ":" in curr_vlm_output_line
+        assert "." in curr_vlm_output_line
+        period_idx = curr_vlm_output_line.find(".")
+        if curr_vlm_output_line[len(atom_query +
+                                    ":"):period_idx].lower().strip() == "true":
+            true_atoms.add(vlm_atoms[i])
+    return true_atoms
+
+
+def abstract(state: State,
+             preds: Collection[Predicate],
+             vlm: Optional[VisionLanguageModel] = None) -> Set[GroundAtom]:
     """Get the atomic representation of the given state (i.e., a set of ground
     atoms), using the given set of predicates.
 
     Duplicate arguments in predicates are allowed.
     """
+    # Start by pulling out all VLM predicates.
+    vlm_preds = set(pred for pred in preds if isinstance(pred, VLMPredicate))
+    # Next, classify all non-VLM predicates.
     atoms = set()
     for pred in preds:
-        for choice in get_object_combinations(list(state), pred.types):
-            if pred.holds(state, choice):
-                atoms.add(GroundAtom(pred, choice))
+        if pred not in vlm_preds:
+            for choice in get_object_combinations(list(state), pred.types):
+                if pred.holds(state, choice):
+                    atoms.add(GroundAtom(pred, choice))
+    if len(vlm_preds) > 0:
+        # Now, aggregate all the VLM predicates and make a single call to a
+        # VLM to get their values.
+        vlm_atoms = set()
+        for pred in vlm_preds:
+            for choice in get_object_combinations(list(state), pred.types):
+                vlm_atoms.add(GroundAtom(pred, choice))
+        atoms |= query_vlm_for_atom_vals(vlm_atoms, state, vlm)
     return atoms
 
 
@@ -3300,6 +3391,19 @@ def save_video(outfile: str, video: Video) -> None:
     logging.info(f"Wrote out to {outpath}")
 
 
+def save_images(outfile_prefix: str, video: Video) -> None:
+    """Save the video as individual images to image_dir."""
+    outdir = CFG.image_dir
+    os.makedirs(outdir, exist_ok=True)
+    width = len(str(len(video)))
+    for i, image in enumerate(video):
+        image_number = str(i).zfill(width)
+        outfile = outfile_prefix + f"_image_{image_number}.png"
+        outpath = os.path.join(outdir, outfile)
+        imageio.imwrite(outpath, image)
+        logging.info(f"Wrote out to {outpath}")
+
+
 def get_env_asset_path(asset_name: str, assert_exists: bool = True) -> str:
     """Return the absolute path to env asset."""
     dir_path = os.path.dirname(os.path.realpath(__file__))
diff --git a/scripts/grammar_search_analysis.py b/scripts/grammar_search_analysis.py
index 4c010ce3e8..7e388912ef 100644
--- a/scripts/grammar_search_analysis.py
+++ b/scripts/grammar_search_analysis.py
@@ -183,7 +183,8 @@ def _run_proxy_analysis_for_env(args: Dict[str, Any], env_name: str,
     options = get_gt_options(env.get_name())
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     start_time = time.perf_counter()
 
     for non_goal_predicates in non_goal_predicate_sets:
diff --git a/scripts/skeleton_score_analysis.py b/scripts/skeleton_score_analysis.py
index 5efbd22c9e..f673c11b58 100644
--- a/scripts/skeleton_score_analysis.py
+++ b/scripts/skeleton_score_analysis.py
@@ -56,9 +56,10 @@ def _setup_data_for_env(env_name: str,
     })
     env = create_new_env(env_name)
     options = get_gt_options(env.get_name())
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, options)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert all(traj.is_demo for traj in dataset.trajectories)
     demo_skeleton_lengths = [
         utils.num_options_in_action_sequence(t.actions)
diff --git a/setup.py b/setup.py
index e35789bffb..becc78a59d 100644
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,7 @@
         "pybullet>=3.2.0",
         "scikit-learn==1.1.2",
         "graphlib-backport",
-        "openai==0.28.1",
+        "openai==1.19.0",
         "pyyaml==6.0",
         "pylint==2.14.5",
         "types-PyYAML",
@@ -44,7 +44,8 @@
         "gym_sokoban@git+https://github.com/Learning-and-Intelligent-Systems/gym-sokoban.git",  # pylint: disable=line-too-long
         "pbrspot@git+https://github.com/NishanthJKumar/pbrspot.git",
         "ImageHash",
-        "google-generativeai"
+        "google-generativeai",
+        "tenacity",
     ],
     include_package_data=True,
     extras_require={
diff --git a/tests/approaches/test_active_sampler_learning_approach.py b/tests/approaches/test_active_sampler_learning_approach.py
index 237c0db4a4..2743eda5fa 100644
--- a/tests/approaches/test_active_sampler_learning_approach.py
+++ b/tests/approaches/test_active_sampler_learning_approach.py
@@ -86,7 +86,9 @@ def test_active_sampler_learning_approach(model_name, right_targets, num_demo,
     approach = ActiveSamplerLearningApproach(env.predicates, options,
                                              env.types, env.action_space,
                                              train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     # Learning with an empty dataset should not crash.
     approach.learn_from_offline_dataset(Dataset([]))
diff --git a/tests/approaches/test_gnn_action_policy_approach.py b/tests/approaches/test_gnn_action_policy_approach.py
index bcf644f5ea..e07cb38c3b 100644
--- a/tests/approaches/test_gnn_action_policy_approach.py
+++ b/tests/approaches/test_gnn_action_policy_approach.py
@@ -26,7 +26,9 @@ def test_gnn_action_policy_approach():
     approach = create_approach("gnn_action_policy", env.predicates,
                                get_gt_options(env.get_name()), env.types,
                                env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     task = env.get_test_tasks()[0].task
     with pytest.raises(AssertionError):  # haven't learned yet!
diff --git a/tests/approaches/test_gnn_metacontroller_approach.py b/tests/approaches/test_gnn_metacontroller_approach.py
index 019e9b5d20..7b0ccd1f41 100644
--- a/tests/approaches/test_gnn_metacontroller_approach.py
+++ b/tests/approaches/test_gnn_metacontroller_approach.py
@@ -62,7 +62,9 @@ def test_gnn_metacontroller_approach_with_envs(env_name, num_epochs):
     approach = create_approach("gnn_metacontroller", env.predicates,
                                get_gt_options(env.get_name()), env.types,
                                env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     task = env.get_test_tasks()[0]
     approach.learn_from_offline_dataset(dataset)
diff --git a/tests/approaches/test_gnn_option_policy_approach.py b/tests/approaches/test_gnn_option_policy_approach.py
index d0bf4dbe01..781358170a 100644
--- a/tests/approaches/test_gnn_option_policy_approach.py
+++ b/tests/approaches/test_gnn_option_policy_approach.py
@@ -60,7 +60,9 @@ def test_gnn_option_policy_approach_with_envs(env_name):
     approach = create_approach("gnn_option_policy", env.predicates,
                                get_gt_options(env.get_name()), env.types,
                                env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     task = env.get_test_tasks()[0].task
     with pytest.raises(AssertionError):  # haven't learned yet!
diff --git a/tests/approaches/test_grammar_search_invention_approach.py b/tests/approaches/test_grammar_search_invention_approach.py
index 11ff701eb6..1c1bdba500 100644
--- a/tests/approaches/test_grammar_search_invention_approach.py
+++ b/tests/approaches/test_grammar_search_invention_approach.py
@@ -15,8 +15,10 @@
     _halving_constant_generator, _NegationClassifier, _PredicateGrammar, \
     _SingleAttributeCompareClassifier, \
     _SingleFeatureInequalitiesPredicateGrammar, _UnaryFreeForallClassifier
+from predicators.datasets import create_dataset
 from predicators.envs.cover import CoverEnv
 from predicators.envs.stick_button import StickButtonMovementEnv
+from predicators.envs.vlm_envs import IceTeaMakingEnv
 from predicators.ground_truth_models import get_gt_options
 from predicators.settings import CFG
 from predicators.structs import Action, Dataset, LowLevelTrajectory, Object, \
@@ -165,6 +167,40 @@ def test_labelled_atoms_invention():
         approach.learn_from_offline_dataset(dataset)
 
 
+def test_invention_from_txt_file():
+    """Test predicate invention on a dataset loaded from a txt file."""
+    utils.reset_config({
+        "env":
+        "ice_tea_making",
+        "num_train_tasks":
+        1,
+        "num_test_tasks":
+        0,
+        "offline_data_method":
+        "demo+labelled_atoms",
+        "data_dir":
+        "tests/datasets/mock_vlm_datasets",
+        "handmade_demo_filename":
+        "ice_tea_making__demo+labelled_atoms__manual__1.txt"
+    })
+    env = IceTeaMakingEnv()
+    train_tasks = env.get_train_tasks()
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    loaded_dataset = create_dataset(env, train_tasks,
+                                    get_gt_options(env.get_name()), predicates)
+    approach = GrammarSearchInventionApproach(env.goal_predicates,
+                                              get_gt_options(env.get_name()),
+                                              env.types, env.action_space,
+                                              train_tasks)
+    approach.learn_from_offline_dataset(loaded_dataset)
+    # The ice_tea_making__demo+labelled_atoms__manual__1.txt happens to
+    # set all atoms to True at all timesteps, and so we expect predicate
+    # invention to not select any of the predicates (only the goal
+    # predicates are selected).
+    assert len(approach._get_current_predicates()) == 1  # pylint:disable=protected-access
+    assert approach._get_current_predicates() == env.goal_predicates  # pylint:disable=protected-access
+
+
 def test_euclidean_grammar():
     """Tests for the EuclideanGrammar."""
     utils.reset_config({"env": "stick_button_move"})
diff --git a/tests/approaches/test_interactive_approach.py b/tests/approaches/test_interactive_approach.py
index e52388bf12..97f612a2a9 100644
--- a/tests/approaches/test_interactive_approach.py
+++ b/tests/approaches/test_interactive_approach.py
@@ -62,7 +62,9 @@ def test_interactive_learning_approach(predicate_classifier_model,
                                            env.types, env.action_space,
                                            stripped_train_tasks)
     teacher = Teacher(train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     # Learning with an empty dataset should not crash.
     approach.learn_from_offline_dataset(Dataset([]))
@@ -239,4 +241,5 @@ def _policy(s: State, memory: Dict, objects: Sequence[Object],
         "teacher_dataset_num_examples": 0,
     })
     with pytest.raises(AssertionError):
-        create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+        create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                       predicates)
diff --git a/tests/approaches/test_llm_bilevel_planning_approach.py b/tests/approaches/test_llm_bilevel_planning_approach.py
index 33e304adf2..d41b8be20a 100644
--- a/tests/approaches/test_llm_bilevel_planning_approach.py
+++ b/tests/approaches/test_llm_bilevel_planning_approach.py
@@ -31,7 +31,9 @@ def test_llm_bilevel_planning_approach():
                                           train_tasks)
     assert approach.get_name() == "llm_bilevel_planning"
     # Test "learning", i.e., constructing the prompt prefix.
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert not approach._prompt_prefix  # pylint: disable=protected-access
     approach.learn_from_offline_dataset(dataset)
     assert approach._prompt_prefix  # pylint: disable=protected-access
diff --git a/tests/approaches/test_llm_open_loop_approach.py b/tests/approaches/test_llm_open_loop_approach.py
index 9ff484e330..efbf257867 100644
--- a/tests/approaches/test_llm_open_loop_approach.py
+++ b/tests/approaches/test_llm_open_loop_approach.py
@@ -34,7 +34,9 @@ def test_llm_open_loop_approach():
                                    env.action_space, train_tasks)
     assert approach.get_name() == "llm_open_loop"
     # Test "learning", i.e., constructing the prompt prefix.
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert not approach._prompt_prefix  # pylint: disable=protected-access
     approach.learn_from_offline_dataset(dataset)
     assert approach._prompt_prefix  # pylint: disable=protected-access
diff --git a/tests/approaches/test_nsrt_learning_approach.py b/tests/approaches/test_nsrt_learning_approach.py
index 6050cd2915..9c27cfea39 100644
--- a/tests/approaches/test_nsrt_learning_approach.py
+++ b/tests/approaches/test_nsrt_learning_approach.py
@@ -74,7 +74,8 @@ def _test_approach(env_name,
         options = parse_config_included_options(env)
     approach = create_approach(approach_name, preds, options, env.types,
                                env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert approach.is_learning_based
     approach.learn_from_offline_dataset(dataset)
     task = env.get_test_tasks()[0].task
diff --git a/tests/approaches/test_nsrt_rl_approach.py b/tests/approaches/test_nsrt_rl_approach.py
index 0d7a4443cd..ada9596347 100644
--- a/tests/approaches/test_nsrt_rl_approach.py
+++ b/tests/approaches/test_nsrt_rl_approach.py
@@ -63,7 +63,8 @@ def test_nsrt_reinforcement_learning_approach(nsrt_rl_reward_epsilon):
     perceiver = create_perceiver("trivial")
     exec_monitor = create_execution_monitor("trivial")
     cogman = CogMan(approach, perceiver, exec_monitor)
-    dataset = create_dataset(env, train_tasks, {})
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, {}, predicates)
     assert approach.is_learning_based
     approach.learn_from_offline_dataset(dataset)
     interaction_requests = approach.get_interaction_requests()
diff --git a/tests/approaches/test_online_nsrt_learning_approach.py b/tests/approaches/test_online_nsrt_learning_approach.py
index 1361a9a2ce..d84dfb1d2e 100644
--- a/tests/approaches/test_online_nsrt_learning_approach.py
+++ b/tests/approaches/test_online_nsrt_learning_approach.py
@@ -39,7 +39,9 @@ def test_online_nsrt_learning_approach():
                                           get_gt_options(env.get_name()),
                                           env.types, env.action_space,
                                           train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     # Learning with an empty dataset should not crash.
     approach.learn_from_offline_dataset(Dataset([]))
diff --git a/tests/approaches/test_online_pg3_approach.py b/tests/approaches/test_online_pg3_approach.py
index fbec90fac1..eec158cdd5 100644
--- a/tests/approaches/test_online_pg3_approach.py
+++ b/tests/approaches/test_online_pg3_approach.py
@@ -38,7 +38,9 @@ def test_online_pg3_approach():
     approach = OnlinePG3Approach(env.predicates,
                                  get_gt_options(env.get_name()), env.types,
                                  env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert approach.is_learning_based
     # Learning with an empty dataset should not crash.
     approach.learn_from_offline_dataset(Dataset([]))
diff --git a/tests/approaches/test_pg3_approach.py b/tests/approaches/test_pg3_approach.py
index de5e70bbdc..171faace4a 100644
--- a/tests/approaches/test_pg3_approach.py
+++ b/tests/approaches/test_pg3_approach.py
@@ -151,7 +151,9 @@ def test_pg3_approach(approach_name, approach_cls):
         option = act.get_option()
         assert option.name == "pick-up"
     # Test learning with a fast heuristic.
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     approach.learn_from_offline_dataset(dataset)
     load_path = utils.get_approach_load_path_str()
     expected_policy_file = f"{load_path}_None.ldl"
@@ -190,7 +192,8 @@ def test_cluttered_table_pg3_approach():
     train_tasks = [t.task for t in env.get_train_tasks()]
     approach = PG3Approach(env.predicates, get_gt_options(env.get_name()),
                            env.types, env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             env.predicates)
     approach.learn_from_offline_dataset(dataset)
     # Test several tasks to make sure we encounter at least one discovered
     # failure.
diff --git a/tests/datasets/mock_vlm_datasets/cover__demo+labelled_atoms__manual__1.txt b/tests/datasets/mock_vlm_datasets/cover__demo+labelled_atoms__manual__1.txt
new file mode 100644
index 0000000000..303c1504a8
--- /dev/null
+++ b/tests/datasets/mock_vlm_datasets/cover__demo+labelled_atoms__manual__1.txt
@@ -0,0 +1,14 @@
+===
+{*IsRobot(robby): True
+} ->
+
+PickPlace()[0.6926419734954834] -> 
+
+{*IsRobot(robby): True
+} ->
+
+PickPlace()[0.6926419734954834] -> 
+
+{*IsRobot(robby): True
+}
+===
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/0/img.jpg b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/0/img.jpg
new file mode 100644
index 0000000000..ea83f3e6ad
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/0/img.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/0/state.p b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/0/state.p
new file mode 100644
index 0000000000..ff9446d1a5
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/0/state.p differ
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/1/img.jpg b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/1/img.jpg
new file mode 100644
index 0000000000..ea83f3e6ad
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/1/img.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/1/state.p b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/1/state.p
new file mode 100644
index 0000000000..ff9446d1a5
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/1/state.p differ
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/2/img.jpg b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/2/img.jpg
new file mode 100644
index 0000000000..ea83f3e6ad
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/2/img.jpg differ
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/2/state.p b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/2/state.p
new file mode 100644
index 0000000000..ff9446d1a5
Binary files /dev/null and b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/2/state.p differ
diff --git a/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/options_traj.txt b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/options_traj.txt
new file mode 100644
index 0000000000..2867f6b6c2
--- /dev/null
+++ b/tests/datasets/mock_vlm_datasets/cover__vlm_demos__456__1/traj_0/options_traj.txt
@@ -0,0 +1,2 @@
+PickPlace()[0.6926419734954834]
+PickPlace()[0.6926419734954834]
diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
index 98895a59a5..e2118d6add 100644
--- a/tests/datasets/test_datasets.py
+++ b/tests/datasets/test_datasets.py
@@ -3,12 +3,14 @@
 import shutil
 from contextlib import nullcontext as does_not_raise
 
+import numpy as np
 import pytest
 
 from predicators import utils
 from predicators.datasets import create_dataset
 from predicators.datasets.generate_atom_trajs_with_vlm import \
-    create_ground_atom_data_from_img_trajs
+    create_ground_atom_data_from_generated_demos, \
+    create_ground_atom_data_from_saved_img_trajs
 from predicators.envs.blocks import BlocksEnv
 from predicators.envs.cluttered_table import ClutteredTableEnv
 from predicators.envs.cover import CoverEnv, CoverMultistepOptions
@@ -38,13 +40,15 @@ def _sample_completions(self,
             # If the query is asking for atom proposals.
             if "Please provide predicates" in prompt:
                 completion = "*Holding(spoon)\n*Fizz(buzz)\n" + \
-                    "Submerged(teabag)\nSubmerged(spoon)"
+                    "*Submerged(teabag)\n*Submerged(spoon)\n*IsRobot(robby)"
             # Else, if the query is asking for particular values.
             elif "values of the following predicates" in prompt:
                 # Completion for default predicates.
                 if "Submerged" in prompt:
                     completion = "*Holding(spoon): True.\n" + \
                         "*Submerged(teabag): False.\n*Submerged(spoon): False."
+                elif "IsRobot" in prompt:
+                    completion = "*IsRobot(robby): True\n"
                 # Completion for debug predicates
                 else:
                     completion = ("hand_grasping_spoon(hand, spoon): True.\n"
@@ -73,7 +77,8 @@ def test_demo_dataset():
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
     options = parse_config_included_options(env)
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert len(dataset.trajectories) == 7
     assert len(dataset.trajectories[0].states) == 3
     assert len(dataset.trajectories[0].actions) == 2
@@ -92,7 +97,9 @@ def test_demo_dataset():
     })
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) == 7
     assert len(dataset.trajectories[0].states) == 3
     assert len(dataset.trajectories[0].actions) == 2
@@ -121,7 +128,8 @@ def test_demo_dataset():
     train_tasks = [t.task for t in env.get_train_tasks()]
     options = parse_config_included_options(env)
     assert options == {Pick}
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert len(dataset.trajectories) == 3
     at_least_one_pick_found = False
     at_least_one_place_found = False
@@ -152,7 +160,9 @@ def test_demo_dataset():
     Holding = [pred for pred in env.predicates if pred.name == "Holding"][0]
     imposs_goal = {GroundAtom(HandEmpty, []), Holding([list(init)[0]])}
     train_tasks[0] = Task(init, imposs_goal)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) < 7
     # Test max_initial_demos.
     utils.reset_config({
@@ -164,43 +174,56 @@ def test_demo_dataset():
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
     assert len(train_tasks) == 7
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) == 3
     utils.update_config({
         "offline_data_method": "not a real method",
     })
     with pytest.raises(NotImplementedError):
-        create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+        create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                       predicates)
     utils.update_config({
         "offline_data_method":
         "demo",
         "offline_data_task_planning_heuristic":
         "not a real heuristic",
     })
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     with pytest.raises(ValueError):
-        create_dataset(env, train_tasks, get_gt_options(env.get_name()))
-    # Test demo video generation.
+        create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                       predicates)
+    # Test demo video and image generation.
     video_dir = os.path.join(os.path.dirname(__file__), "_fake_videos")
+    image_dir = os.path.join(os.path.dirname(__file__), "_fake_images")
     utils.reset_config({
         "env": "cover",
         "offline_data_method": "demo",
         "num_train_tasks": 1,
         "make_demo_videos": True,
+        "make_demo_images": True,
         "cover_num_blocks": 1,
         "cover_num_targets": 1,
         "cover_block_widths": [0.1],
         "cover_target_widths": [0.05],
         "cover_initial_holding_prob": 1.0,
         "video_dir": video_dir,
+        "image_dir": image_dir
     })
     video_file = os.path.join(video_dir, "cover__123__demo__task0.mp4")
+    image_file = os.path.join(image_dir, "cover__123__demo__task0_image_0.png")
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
     assert len(train_tasks) == 1
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) == 1
     assert os.path.exists(video_file)
+    assert os.path.exists(image_file)
     shutil.rmtree(video_dir)
+    shutil.rmtree(image_dir)
     # Test demo collection with bilevel_plan_without_sim.
     utils.reset_config({
         "env": "cover",
@@ -214,7 +237,8 @@ def test_demo_dataset():
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
     options = parse_config_included_options(env)
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert 0 < len(dataset.trajectories) < 5
     # Use bilevel planning to collect data, but don't use otherwise.
     utils.reset_config({
@@ -230,7 +254,8 @@ def test_demo_dataset():
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
     options = parse_config_included_options(env)
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert len(dataset.trajectories) == 5
 
 
@@ -259,9 +284,10 @@ def test_demo_dataset_loading(num_train_tasks, load_data, demonstrator,
         shutil.rmtree(CFG.data_dir)
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     with expectation as e:
         dataset = create_dataset(env, train_tasks,
-                                 get_gt_options(env.get_name()))
+                                 get_gt_options(env.get_name()), predicates)
     if e is None:
         assert len(dataset.trajectories) == num_train_tasks
         assert all(traj.train_task_idx < len(train_tasks)
@@ -292,7 +318,9 @@ def test_demo_dataset_loading_tricky_case(num_train_tasks, load_data,
         shutil.rmtree(CFG.data_dir)
     env = BlocksEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     # Note the use of <= here rather than ==.
     assert len(dataset.trajectories) <= num_train_tasks
     assert all(traj.train_task_idx < len(train_tasks)
@@ -314,7 +342,9 @@ def test_demo_replay_dataset():
     })
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) == 5 + 3
     assert len(dataset.trajectories[-1].states) == 2
     assert len(dataset.trajectories[-1].actions) == 1
@@ -338,7 +368,8 @@ def test_demo_replay_dataset():
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
     options = parse_config_included_options(env)
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert len(dataset.trajectories) == 5 + 3
     assert len(dataset.trajectories[-1].states) == 2
     assert len(dataset.trajectories[-1].actions) == 1
@@ -368,7 +399,8 @@ def test_demo_replay_dataset():
     train_tasks = [t.task for t in env.get_train_tasks()]
     options = parse_config_included_options(env)
     assert options == {Pick}
-    dataset = create_dataset(env, train_tasks, options)
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, options, predicates)
     assert len(dataset.trajectories) == 3 + 3
     at_least_one_pick_found = False
     at_least_one_place_found = False
@@ -392,7 +424,9 @@ def test_demo_replay_dataset():
     })
     env = ClutteredTableEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories[-1].states) == 2
     assert len(dataset.trajectories[-1].actions) == 1
 
@@ -410,8 +444,10 @@ def test_dataset_with_annotations():
     })
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     trajectories = create_dataset(env, train_tasks,
-                                  get_gt_options(env.get_name())).trajectories
+                                  get_gt_options(env.get_name()),
+                                  predicates).trajectories
     # The annotations and trajectories need to be the same length.
     with pytest.raises(AssertionError):
         dataset = Dataset(trajectories, [])
@@ -440,7 +476,9 @@ def test_ground_atom_dataset():
     })
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) == 15
     assert len(dataset.annotations) == 15
     Covers, HandEmpty, Holding = _get_predicates_by_names(
@@ -491,8 +529,10 @@ def test_ground_atom_dataset():
     })
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     with pytest.raises(ValueError):
-        create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+        create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                       predicates)
 
 
 def test_empty_dataset():
@@ -503,12 +543,49 @@ def test_empty_dataset():
     })
     env = CoverEnv()
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     assert len(dataset.trajectories) == 0
     with pytest.raises(AssertionError):
         _ = dataset.annotations
 
 
+def test_loading_saved_vlm_img_demos_folder_non_dummy_goal():
+    """Test loading a dataset from img demo files (non-dummy goal case)."""
+    utils.reset_config({
+        "env": "cover",
+        "num_train_tasks": 1,
+        "offline_data_method": "saved_vlm_img_demos_folder",
+        "data_dir": "tests/datasets/mock_vlm_datasets",
+        "seed": 456,
+        "vlm_trajs_folder_name": "cover__vlm_demos__456__1",
+        "grammar_search_vlm_atom_proposal_prompt_type": "naive_each_step",
+        "grammar_search_vlm_atom_label_prompt_type": "per_scene_naive",
+        "pretrained_model_prompt_cache_dir":
+        "tests/datasets/mock_vlm_datasets/cache",
+        "cover_num_blocks": 1,
+        "cover_num_targets": 1,
+        "cover_block_widths": [0.1],
+        "cover_target_widths": [0.05],
+        "excluded_predicates": "all"
+    })
+    env = CoverEnv()
+    train_tasks = env.get_train_tasks()
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    vlm = _DummyVLM()
+    loaded_dataset = create_ground_atom_data_from_saved_img_trajs(
+        env, train_tasks, predicates, get_gt_options(env.get_name()), vlm)
+    assert len(loaded_dataset.trajectories) == 1
+    assert len(loaded_dataset.annotations) == 1
+    assert "DummyGoal" not in str(loaded_dataset.annotations[0][-1])
+    for dirpath, _, filenames in os.walk(
+            CFG.pretrained_model_prompt_cache_dir):
+        # Delete files under the prompt cache dir; keep the directories.
+        for filename in filenames:
+            os.unlink(os.path.join(dirpath, filename))
+
+
 @pytest.mark.parametrize(
     "atom_proposal_prompt_type, atom_labelling_prompt_type",
     [("naive_each_step", "per_scene_naive"),
@@ -516,8 +593,8 @@ def test_empty_dataset():
      ("naive_whole_traj", "per_scene_cot"),
      ("not_a_real_prompt_type", "per_scene_cot"),
      ("naive_whole_traj", "not_a_real_prompt_type")])
-def test_loading_img_demos(atom_proposal_prompt_type,
-                           atom_labelling_prompt_type):
+def test_loading_saved_vlm_img_demos_folder_dummy_goal(
+        atom_proposal_prompt_type, atom_labelling_prompt_type):
     """Test loading a dataset from img demo files."""
     utils.reset_config({
         "env":
@@ -525,7 +602,7 @@ def test_loading_img_demos(atom_proposal_prompt_type,
         "num_train_tasks":
         1,
         "offline_data_method":
-        "img_demos",
+        "saved_vlm_img_demos_folder",
         "data_dir":
         "tests/datasets/mock_vlm_datasets",
         "seed":
@@ -541,11 +618,12 @@ def test_loading_img_demos(atom_proposal_prompt_type,
     })
     env = IceTeaMakingEnv()
     train_tasks = env.get_train_tasks()
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     vlm = _DummyVLM()
     if atom_proposal_prompt_type != "not_a_real_prompt_type" and \
         atom_labelling_prompt_type != "not_a_real_prompt_type":
-        loaded_dataset = create_ground_atom_data_from_img_trajs(
-            env, train_tasks, get_gt_options(env.get_name()), vlm)
+        loaded_dataset = create_ground_atom_data_from_saved_img_trajs(
+            env, train_tasks, predicates, get_gt_options(env.get_name()), vlm)
         assert len(loaded_dataset.trajectories) == 1
         assert len(loaded_dataset.annotations) == 1
         assert len(loaded_dataset.annotations[0][0]) == 1
@@ -553,8 +631,9 @@ def test_loading_img_demos(atom_proposal_prompt_type,
         assert "DummyGoal" in str(loaded_dataset.annotations[0][-1])
     else:
         with pytest.raises(ValueError) as e:
-            loaded_dataset = create_ground_atom_data_from_img_trajs(
-                env, train_tasks, get_gt_options(env.get_name()), vlm)
+            loaded_dataset = create_ground_atom_data_from_saved_img_trajs(
+                env, train_tasks, predicates, get_gt_options(env.get_name()),
+                vlm)
         assert "Unknown" in str(e)
     for dirpath, _, filenames in os.walk(
             CFG.pretrained_model_prompt_cache_dir):
@@ -569,7 +648,7 @@ def test_env_debug_grammar():
     utils.reset_config({
         "env": "ice_tea_making",
         "num_train_tasks": 1,
-        "offline_data_method": "img_demos",
+        "offline_data_method": "saved_vlm_img_demos_folder",
         "data_dir": "tests/datasets/mock_vlm_datasets",
         "seed": 456,
         "vlm_trajs_folder_name": "ice_tea_making__vlm_demos__456__1",
@@ -579,10 +658,11 @@ def test_env_debug_grammar():
         "grammar_search_vlm_atom_proposal_use_debug": True
     })
     env = IceTeaMakingEnv()
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     train_tasks = env.get_train_tasks()
     vlm = _DummyVLM()
-    loaded_dataset = create_ground_atom_data_from_img_trajs(
-        env, train_tasks, get_gt_options(env.get_name()), vlm)
+    loaded_dataset = create_ground_atom_data_from_saved_img_trajs(
+        env, train_tasks, predicates, get_gt_options(env.get_name()), vlm)
     assert len(loaded_dataset.trajectories) == 1
     assert len(loaded_dataset.annotations) == 1
     assert len(loaded_dataset.annotations[0][0]) == 6
@@ -606,6 +686,54 @@ def test_loading_txt_files():
     })
     env = IceTeaMakingEnv()
     train_tasks = env.get_train_tasks()
+    predicates, _ = utils.parse_config_excluded_predicates(env)
     loaded_dataset = create_dataset(env, train_tasks,
-                                    get_gt_options(env.get_name()))
+                                    get_gt_options(env.get_name()), predicates)
     assert len(loaded_dataset.trajectories) == 1
+    utils.reset_config({
+        "env":
+        "cover",
+        "num_train_tasks":
+        1,
+        "offline_data_method":
+        "demo+labelled_atoms",
+        "data_dir":
+        "tests/datasets/mock_vlm_datasets",
+        "handmade_demo_filename":
+        "cover__demo+labelled_atoms__manual__1.txt"
+    })
+    env = CoverEnv()
+    train_tasks = env.get_train_tasks()
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    loaded_dataset = create_dataset(env, train_tasks,
+                                    get_gt_options(env.get_name()), predicates)
+    assert len(loaded_dataset.trajectories) == 1
+    assert len(loaded_dataset.trajectories[0].actions) == 2
+    assert loaded_dataset.trajectories[0].actions[0].get_option(
+    ).name == "PickPlace"
+
+
+def test_create_ground_atom_data_from_generated_demos():
+    """Run create_ground_atom_data_from_generated_demos on one cover demo."""
+    utils.reset_config({
+        "env": "cover",
+        "approach": "oracle",
+        "offline_data_method": "demo",
+        "offline_data_planning_timeout": 500,
+        "option_learner": "no_learning",
+        "num_train_tasks": 1,
+        "included_options": "PickPlace",
+        "excluded_predicates": "all",
+    })
+    cover_env = CoverEnv()
+    tasks = [env_task.task for env_task in cover_env.get_train_tasks()]
+    known_preds, _ = utils.parse_config_excluded_predicates(cover_env)
+    known_options = parse_config_included_options(cover_env)
+    demos = create_dataset(cover_env, tasks, known_options, known_preds)
+    assert len(demos.trajectories) == 1
+    # Store a one-element list with a zeroed 32x32 uint8 array on each state.
+    for demo_state in demos.trajectories[0].states:
+        demo_state.simulator_state = [np.zeros((32, 32), dtype=np.uint8)]
+    annotated = create_ground_atom_data_from_generated_demos(
+        demos, cover_env, known_preds, tasks, _DummyVLM())
+    assert len(annotated.annotations) == 1
diff --git a/tests/datasets/test_vlm_predicate_img.jpg b/tests/datasets/test_vlm_predicate_img.jpg
new file mode 100644
index 0000000000..2c35504483
Binary files /dev/null and b/tests/datasets/test_vlm_predicate_img.jpg differ
diff --git a/tests/envs/test_burger.py b/tests/envs/test_burger.py
new file mode 100644
index 0000000000..fc2693c864
--- /dev/null
+++ b/tests/envs/test_burger.py
@@ -0,0 +1,231 @@
+"""Test cases for the burger environment."""
+
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import pytest
+
+from predicators import utils
+from predicators.envs.burger import BurgerEnv
+from predicators.ground_truth_models import get_gt_nsrts, get_gt_options
+from predicators.settings import CFG
+from predicators.structs import Action, GroundAtom
+
+
+def test_burger():
+    """Tests for BurgerEnv: predicates, simulation, rendering and options."""
+
+    utils.reset_config({
+        "env": "burger",
+        "option_model_terminate_on_repeat": False,
+        "sesame_max_skeletons_optimized": 1000,
+        "sesame_max_samples_per_step": 1,
+        "sesame_task_planner": "fdopt"
+    })
+
+    env = BurgerEnv()
+    for task in env.get_train_tasks():
+        for obj in task.init:
+            assert len(obj.type.feature_names) == len(task.init[obj])
+    for task in env.get_test_tasks():
+        for obj in task.init:
+            assert len(obj.type.feature_names) == len(task.init[obj])
+    assert len(env.predicates) == 12
+    assert len(env.goal_predicates) == 3
+    assert env.get_name() == "burger"
+    assert len(env.types) == 11
+    options = get_gt_options(env.get_name())
+    assert len(options) == 5
+    nsrts = get_gt_nsrts(env.get_name(), env.predicates, options)
+    assert len(nsrts) == 15
+    task = env.get_train_tasks()[0]
+    MoveWhenFacingOneStack = [
+        n for n in nsrts if n.name == "MoveWhenFacingOneStack"
+    ][0]
+    MoveWhenFacingTwoStack = [
+        n for n in nsrts if n.name == "MoveWhenFacingTwoStack"
+    ][0]
+    MoveWhenFacingThreeStack = [
+        n for n in nsrts if n.name == "MoveWhenFacingThreeStack"
+    ][0]
+    MoveWhenFacingFourStack = [
+        n for n in nsrts if n.name == "MoveWhenFacingFourStack"
+    ][0]
+    MoveFromNothingToOneStack = [
+        n for n in nsrts if n.name == "MoveFromNothingToOneStack"
+    ][0]
+    MoveFromNothingToTwoStack = [
+        n for n in nsrts if n.name == "MoveFromNothingToTwoStack"
+    ][0]
+    MoveFromNothingToFourStack = [
+        n for n in nsrts if n.name == "MoveFromNothingToFourStack"
+    ][0]
+    MoveFromOneStackToThreeStack = [
+        n for n in nsrts if n.name == "MoveFromOneStackToThreeStack"
+    ][0]
+    PickSingleAdjacent = [n for n in nsrts
+                          if n.name == "PickSingleAdjacent"][0]
+    PickFromStack = [n for n in nsrts if n.name == "PickFromStack"][0]
+    Place = [n for n in nsrts if n.name == "Place"][0]
+    Cook = [n for n in nsrts if n.name == "Cook"][0]
+    Slice = [n for n in nsrts if n.name == "Slice"][0]
+
+    grill = [obj for obj in task.init if obj.name == "grill"][0]
+    patty = [obj for obj in task.init if obj.name == "patty"][0]
+    robot = [obj for obj in task.init if obj.name == "robby"][0]
+    tomato = [obj for obj in task.init if obj.name == "tomato"][0]
+    cutting_board = [obj for obj in task.init
+                     if obj.name == "cutting_board"][0]
+    cheese = [obj for obj in task.init if obj.name == "cheese"][0]
+    top_bun = [obj for obj in task.init if obj.name == "top_bun"][0]
+    bottom_bun = [obj for obj in task.init if obj.name == "bottom_bun"][0]
+
+    plan = [
+        MoveWhenFacingOneStack.ground([robot, patty, grill]),
+        PickSingleAdjacent.ground([robot, patty]),
+        MoveFromNothingToOneStack.ground([robot, grill]),
+        Place.ground([robot, patty, grill]),
+        Cook.ground([robot, patty, grill]),
+        PickFromStack.ground([robot, patty, grill]),
+        MoveWhenFacingOneStack.ground([robot, bottom_bun, grill]),
+        Place.ground([robot, patty, bottom_bun]),
+        MoveWhenFacingTwoStack.ground([robot, cheese, patty, bottom_bun]),
+        PickSingleAdjacent.ground([robot, cheese]),
+        MoveFromNothingToTwoStack.ground([robot, patty, bottom_bun]),
+        Place.ground([robot, cheese, patty]),
+        MoveWhenFacingThreeStack.ground(
+            [robot, tomato, cheese, patty, bottom_bun]),
+        PickSingleAdjacent.ground([robot, tomato]),
+        MoveFromNothingToOneStack.ground([robot, cutting_board]),
+        Place.ground([robot, tomato, cutting_board]),
+        Slice.ground([robot, tomato, cutting_board]),
+        PickFromStack.ground([robot, tomato, cutting_board]),
+        MoveFromOneStackToThreeStack.ground(
+            [robot, cheese, patty, bottom_bun, cutting_board]),
+        Place.ground([robot, tomato, cheese]),
+        MoveWhenFacingFourStack.ground(
+            [robot, top_bun, tomato, cheese, patty, bottom_bun]),
+        PickSingleAdjacent.ground([robot, top_bun]),
+        MoveFromNothingToFourStack.ground(
+            [robot, tomato, cheese, patty, bottom_bun]),
+        Place.ground([robot, top_bun, tomato])
+    ]
+
+    option_plan = [n.option.ground(n.option_objs, []) for n in plan]
+    policy = utils.option_plan_to_policy(option_plan)
+    traj, _ = utils.run_policy(policy,
+                               env,
+                               "train",
+                               0,
+                               termination_function=lambda s: False,
+                               max_num_steps=CFG.horizon,
+                               exceptions_to_break_on={
+                                   utils.OptionExecutionFailure,
+                                   utils.HumanDemonstrationFailure,
+                               },
+                               monitor=None)
+
+    # Test _AdjacentToNothing_holds
+    state = task.init
+    state.set(robot, "col", 1)
+    state.set(top_bun, "col", 2)
+    abstract_state = utils.abstract(state, env.predicates)
+    AdjacentToNothing = [
+        p for p in env.predicates if p.name == "AdjacentToNothing"
+    ][0]
+    assert GroundAtom(AdjacentToNothing, [robot]) in abstract_state
+
+    # Test _OnNothing_holds
+    OnNothing = [p for p in env.predicates if p.name == "OnNothing"][0]
+    assert GroundAtom(OnNothing,
+                      [top_bun]) not in utils.abstract(traj.states[-1],
+                                                       env.predicates)
+
+    # Test _GoalHack_holds
+    GoalHack = [p for p in env.predicates if p.name == "GoalHack"][0]
+    assert GroundAtom(GoalHack, [bottom_bun, patty, cheese, tomato, top_bun
+                                 ]) in utils.abstract(traj.states[-1],
+                                                      env.predicates)
+
+    # Test get_cell_in_direction
+    x, y = env.get_cell_in_direction(1, 1, "left")
+    assert x == 0 and y == 1
+    x, y = env.get_cell_in_direction(1, 1, "up")
+    assert x == 1 and y == 2
+    x, y = env.get_cell_in_direction(1, 1, "no_change")
+    assert x == 1 and y == 1
+
+    # Test collision: the action must leave the robot's position unchanged
+    state.set(robot, "col", 2)  # robot is at (2, 2)
+    action = Action(np.array([1, 0, -1, 0, 0], dtype=np.float32))
+    next_state = env.simulate(state, action)
+    assert env.get_position(robot,
+                            next_state) == env.get_position(robot, state)
+
+    # Test placing on the ground: the patty should end up with z == 0
+    state = traj.states[5]
+    action = Action(np.array([0, 0, -1, 0, 1], dtype=np.float32))
+    next_state = env.simulate(state, action)
+    assert env.get_position(patty, next_state) == env.get_position(
+        patty, traj.states[4])
+    assert next_state.get(patty, "z") == 0
+
+    # Test rendering (smoke test; nothing is asserted on the output)
+    env.render_state_plt(traj.states[0], task)
+    env.render_state_plt(traj.states[5], task)
+    env.render_state_plt(traj.states[-1], task)
+
+    # Test interface for collecting demonstrations
+    event_to_action = env.get_event_to_action_fn()
+    fig = plt.figure()
+    event = matplotlib.backend_bases.KeyEvent("test", fig.canvas, "asdf")
+    assert isinstance(event_to_action(state, event), Action)
+    event = matplotlib.backend_bases.KeyEvent("test", fig.canvas, "q")
+    with pytest.raises(utils.HumanDemonstrationFailure):
+        event_to_action(state, event)
+    for key in ["w", "a", "s", "d", "left", "right", "down", "up", "e", "f"]:
+        event = matplotlib.backend_bases.KeyEvent("test", fig.canvas, key)
+        event_to_action(state, event)
+    plt.close()
+
+    # Test move option when already adjacent but not facing
+    state = task.init
+    state.set(grill, "col", 2)
+    state.set(grill, "row", 3)
+    Move = [o for o in options if o.name == "Move"][0]
+    option = Move.ground([robot, grill], [])
+    assert option.initiable(state)
+    action = option.policy(state)
+    next_state = env.step(action)
+    assert next_state.get(robot, "dir") == 0
+
+    state = task.init
+    state.set(grill, "col", 2)
+    state.set(grill, "row", 1)
+    Move = [o for o in options if o.name == "Move"][0]
+    option = Move.ground([robot, grill], [])
+    assert option.initiable(state)
+    action = option.policy(state)
+    next_state = env.step(action)
+    assert next_state.get(robot, "dir") == 2
+
+    state = task.init
+    state.set(grill, "col", 1)
+    state.set(grill, "row", 2)
+    Move = [o for o in options if o.name == "Move"][0]
+    option = Move.ground([robot, grill], [])
+    assert option.initiable(state)
+    action = option.policy(state)
+    next_state = env.step(action)
+    assert next_state.get(robot, "dir") == 1
+
+    state = task.init
+    state.set(grill, "col", 3)
+    state.set(grill, "row", 2)
+    state.set(robot, "dir", 1)
+    Move = [o for o in options if o.name == "Move"][0]
+    option = Move.ground([robot, grill], [])
+    assert option.initiable(state)
+    action = option.policy(state)
+    next_state = env.step(action)
+    assert next_state.get(robot, "dir") == 3
diff --git a/tests/envs/test_cover.py b/tests/envs/test_cover.py
index d8322903ce..575f6b4b22 100644
--- a/tests/envs/test_cover.py
+++ b/tests/envs/test_cover.py
@@ -763,3 +763,22 @@ def test_regional_bumpy_cover_env():
         act = policy(state)
         next_state = env.simulate(state, act)
         assert state.allclose(next_state)
+
+
+def test_debug_atoms_vlm_str():
+    """Check that cover's VLM debug atom strings include the expected set."""
+    utils.reset_config({
+        "env": "cover",
+        "excluded_predicates": "all",
+        "num_train_tasks": 1
+    })
+    cover_env = create_new_env("cover")
+    produced = cover_env.get_vlm_debug_atom_strs(cover_env.get_train_tasks())
+    expected = (
+        'Holding(block0)', 'IsBlock(block1)', 'IsTarget(target0)',
+        'Holding(block1)', 'HandEmpty()', 'IsTarget(target1)',
+        'IsBlock(block0)'
+    )
+    # Every expected atom string must appear in what the env reports.
+    missing = [atom for atom in expected if atom not in produced]
+    assert not missing
diff --git a/tests/explorers/test_online_learning.py b/tests/explorers/test_online_learning.py
index be8db3a5ea..6ca5f38876 100644
--- a/tests/explorers/test_online_learning.py
+++ b/tests/explorers/test_online_learning.py
@@ -114,7 +114,9 @@ def test_interaction():
     perceiver = create_perceiver("trivial")
     exec_monitor = create_execution_monitor("trivial")
     cogman = CogMan(approach, perceiver, exec_monitor)
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     _run_pipeline(env, cogman, train_tasks, dataset)
     utils.update_config({
         "approach": "bridge_policy",
diff --git a/tests/nsrt_learning/strips_learning/test_clustering_learner.py b/tests/nsrt_learning/strips_learning/test_clustering_learner.py
index 87fc555acc..dcdb3f350b 100644
--- a/tests/nsrt_learning/strips_learning/test_clustering_learner.py
+++ b/tests/nsrt_learning/strips_learning/test_clustering_learner.py
@@ -180,6 +180,79 @@ def _simulate(s, a):
     Delete Effects: [Pred0(?x1:cup_type), Pred1(?x1:cup_type, ?x0:cup_type), Pred1(?x1:cup_type, ?x1:cup_type), Pred1(?x1:cup_type, ?x2:cup_type), Pred2(?x1:cup_type)]
     Ignore Effects: []"""  # pylint: disable=line-too-long
 
+    # Test pruning pnads with small datastores. The setup is copied from the
+    # next test, test_cluster_and_search_strips_learner().
+    obj_type = Type("obj_type", ["red", "green", "blue", "happy", "sad"])
+    obj = obj_type("obj")
+    IsRed = Predicate("IsRed", [obj_type], lambda s, o: s[o[0]][0] > 0.5)
+    IsGreen = Predicate("IsGreen", [obj_type], lambda s, o: s[o[0]][1] > 0.5)
+    IsBlue = Predicate("IsBlue", [obj_type], lambda s, o: s[o[0]][2] > 0.5)
+    IsHappy = Predicate("IsHappy", [obj_type], lambda s, o: s[o[0]][3] > 0.5)
+    IsSad = Predicate("IsSad", [obj_type], lambda s, o: s[o[0]][4] > 0.5)
+    preds = {IsRed, IsGreen, IsBlue, IsHappy, IsSad}
+    Interact = utils.SingletonParameterizedOption(
+        "Interact", lambda s, m, o, p: None).ground([], [])
+    # We give three demonstrations. When the object is red or green, it
+    # becomes happy. When the object is blue, it becomes sad.
+    s1 = State({obj: [1.0, 0.0, 0.0, 0.0, 0.0]})
+    a1 = Action([], Interact)
+    ns1 = State({obj: [1.0, 0.0, 0.0, 1.0, 0.0]})
+    g1 = {IsHappy([obj])}
+    traj1 = LowLevelTrajectory([s1, ns1], [a1], True, 0)
+    task1 = Task(s1, g1)
+    segment1 = Segment(traj1, utils.abstract(s1, preds),
+                       utils.abstract(ns1, preds), Interact)
+    s2 = State({obj: [0.0, 1.0, 0.0, 0.0, 0.0]})
+    a2 = Action([], Interact)
+    ns2 = State({obj: [0.0, 1.0, 0.0, 1.0, 0.0]})
+    g2 = {IsHappy([obj])}
+    traj2 = LowLevelTrajectory([s2, ns2], [a2], True, 1)
+    task2 = Task(s2, g2)
+    segment2 = Segment(traj2, utils.abstract(s2, preds),
+                       utils.abstract(ns2, preds), Interact)
+    s3 = State({obj: [0.0, 0.0, 1.0, 0.0, 0.0]})
+    a3 = Action([], Interact)
+    ns3 = State({obj: [0.0, 0.0, 1.0, 0.0, 1.0]})
+    g3 = {IsSad([obj])}
+    traj3 = LowLevelTrajectory([s3, ns3], [a3], True, 2)
+    task3 = Task(s3, g3)
+    segment3 = Segment(traj3, utils.abstract(s3, preds),
+                       utils.abstract(ns3, preds), Interact)
+    utils.reset_config({
+        "strips_learner": "cluster_and_intersect",
+        "cluster_and_intersect_prune_low_data_pnads": True,
+        "cluster_and_intersect_min_datastore_fraction": 0.0
+    })
+    pnads = learn_strips_operators([traj1, traj2, traj3],
+                                   [task1, task2, task3],
+                                   preds, [[segment1], [segment2], [segment3]],
+                                   verify_harmlessness=False,
+                                   annotations=None)
+    assert len(pnads) == 2
+    utils.reset_config({
+        "strips_learner": "cluster_and_intersect",
+        "cluster_and_intersect_prune_low_data_pnads": True,
+        "cluster_and_intersect_min_datastore_fraction": 0.4
+    })
+    pnads = learn_strips_operators([traj1, traj2, traj3],
+                                   [task1, task2, task3],
+                                   preds, [[segment1], [segment2], [segment3]],
+                                   verify_harmlessness=False,
+                                   annotations=None)
+    assert len(pnads) == 1
+    assert len(pnads[0].datastore) == 2
+    utils.reset_config({
+        "strips_learner": "cluster_and_intersect",
+        "cluster_and_intersect_prune_low_data_pnads": True,
+        "cluster_and_intersect_min_datastore_fraction": 0.9
+    })
+    pnads = learn_strips_operators([traj1, traj2, traj3],
+                                   [task1, task2, task3],
+                                   preds, [[segment1], [segment2], [segment3]],
+                                   verify_harmlessness=False,
+                                   annotations=None)
+    assert len(pnads) == 0
+
 
 def test_cluster_and_search_strips_learner():
     """Tests for ClusterAndSearchSTRIPSLearner."""
diff --git a/tests/nsrt_learning/strips_learning/test_oracle_learner.py b/tests/nsrt_learning/strips_learning/test_oracle_learner.py
index 1f358faad5..c235018302 100644
--- a/tests/nsrt_learning/strips_learning/test_oracle_learner.py
+++ b/tests/nsrt_learning/strips_learning/test_oracle_learner.py
@@ -25,7 +25,9 @@ def test_oracle_strips_learner():
     # With sufficiently representative data, all operators should be learned.
     env = create_new_env("blocks")
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     segmented_trajs = [
         segment_trajectory(t, env.predicates) for t in dataset.trajectories
     ]
@@ -71,7 +73,8 @@ def test_oracle_strips_learner():
     })
     env = create_new_env("blocks")
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, set())
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, set(), predicates)
     segmented_trajs = [
         segment_trajectory(t, env.predicates) for t in dataset.trajectories
     ]
@@ -115,7 +118,9 @@ def test_oracle_strips_learner():
     })
     env = create_new_env("cover_multistep_options")
     train_tasks = [t.task for t in env.get_train_tasks()]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     # Truncate dataset.
     state, next_state = dataset.trajectories[0].states[:2]
     action = dataset.trajectories[0].actions[0]
diff --git a/tests/nsrt_learning/test_option_learning.py b/tests/nsrt_learning/test_option_learning.py
index 18c094a462..9fb2015191 100644
--- a/tests/nsrt_learning/test_option_learning.py
+++ b/tests/nsrt_learning/test_option_learning.py
@@ -311,7 +311,11 @@ def test_option_learning_approach_multistep_cover():
     approach = create_approach("nsrt_learning", env.predicates,
                                get_gt_options(env.get_name()), env.types,
                                env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, known_options=set())
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env,
+                             train_tasks,
+                             known_options=set(),
+                             known_predicates=predicates)
     assert approach.is_learning_based
     approach.learn_from_offline_dataset(dataset)
     num_test_successes = 0
@@ -348,7 +352,11 @@ def test_implicit_bc_option_learning_touch_point():
     approach = create_approach("nsrt_learning", env.predicates,
                                get_gt_options(env.get_name()), env.types,
                                env.action_space, train_tasks)
-    dataset = create_dataset(env, train_tasks, known_options=set())
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env,
+                             train_tasks,
+                             known_options=set(),
+                             known_predicates=predicates)
     assert approach.is_learning_based
     approach.learn_from_offline_dataset(dataset)
     num_test_successes = 0
@@ -400,7 +408,11 @@ def test_action_conversion():
         train_tasks = [t.task for t in env.get_train_tasks()]
         approach = create_approach("nsrt_learning", env.predicates, set(),
                                    env.types, env.action_space, train_tasks)
-        dataset = create_dataset(env, train_tasks, known_options=set())
+        predicates, _ = utils.parse_config_excluded_predicates(env)
+        dataset = create_dataset(env,
+                                 train_tasks,
+                                 known_options=set(),
+                                 known_predicates=predicates)
         approach.learn_from_offline_dataset(dataset)
         task = env.get_test_tasks()[0]
         robot, target = sorted(list(task.init))
diff --git a/tests/nsrt_learning/test_segmentation.py b/tests/nsrt_learning/test_segmentation.py
index 58626d724b..3250811e97 100644
--- a/tests/nsrt_learning/test_segmentation.py
+++ b/tests/nsrt_learning/test_segmentation.py
@@ -183,7 +183,11 @@ def _simulate(s, a):
     env = create_new_env("cover_multistep_options", do_cache=False)
     train_tasks = [t.task for t in env.get_train_tasks()]
     assert len(train_tasks) == 1
-    dataset = create_dataset(env, train_tasks, known_options=set())
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env,
+                             train_tasks,
+                             known_options=set(),
+                             known_predicates=predicates)
     ground_atom_dataset = utils.create_ground_atom_dataset(
         dataset.trajectories, env.predicates)
     assert len(ground_atom_dataset) == 1
@@ -229,7 +233,9 @@ def test_contact_based_segmentation(env):
     env = create_new_env(env, do_cache=False)
     train_tasks = [t.task for t in env.get_train_tasks()]
     assert len(train_tasks) == 1
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     ground_atom_dataset = utils.create_ground_atom_dataset(
         dataset.trajectories, env.predicates)
     assert len(ground_atom_dataset) == 1
diff --git a/tests/test_cogman.py b/tests/test_cogman.py
index d16004d1c1..6e05211580 100644
--- a/tests/test_cogman.py
+++ b/tests/test_cogman.py
@@ -1,15 +1,20 @@
 """Tests for cogman.py."""
 
+import time
+from typing import Any, List
+
 import numpy as np
 import pytest
 
 from predicators import utils
 from predicators.approaches import create_approach
-from predicators.cogman import CogMan
+from predicators.cogman import CogMan, run_episode_and_get_observations
 from predicators.envs import get_or_create_env
+from predicators.envs.cover import CoverEnv
 from predicators.execution_monitoring import create_execution_monitor
 from predicators.ground_truth_models import get_gt_options
 from predicators.perception import create_perceiver
+from predicators.structs import Action, DefaultState
 
 
 @pytest.mark.parametrize("exec_monitor_name", ["trivial", "mpc"])
@@ -72,3 +77,158 @@ def test_cogman_with_expected_atoms_monitor():
     next_obs = env.step(act)
     next_act = cogman.step(next_obs)
     assert not np.allclose(act.arr, next_act.arr)
+
+
+def test_run_episode_and_get_observations():
+    """Tests for run_episode_and_get_observations()."""
+    utils.reset_config({"env": "cover"})
+    env = CoverEnv()
+    train_tasks = [t.task for t in env.get_train_tasks()]
+    cover_options = get_gt_options(env.get_name())
+    task = env.get_task("test", 0)
+    approach = create_approach("random_options", env.predicates, cover_options,
+                               env.types, env.action_space, train_tasks)
+    perceiver = create_perceiver("trivial")
+    exec_monitor = create_execution_monitor("trivial")
+    cogman = CogMan(approach, perceiver, exec_monitor)
+    cogman.reset(task)
+    (states, actions), solved, metrics = run_episode_and_get_observations(
+        cogman, env, "test", 0, max_num_steps=5)
+    assert not solved
+    assert len(states) == 6
+    assert len(actions) == 5
+    assert "policy_call_time" in metrics
+    assert metrics["policy_call_time"] > 0.0
+    assert metrics["num_options_executed"] > 0.0
+
+    # Test exceptions_to_break_on.
+    def _value_error_policy(_):
+        raise ValueError("mock error")
+
+    class _MockApproach:
+
+        def __init__(self, policy):
+            self._policy = policy
+
+        def solve(self, task, timeout):
+            """Just use the given policy."""
+            del task, timeout  # unused
+            return self._policy
+
+        def get_execution_monitoring_info(self) -> List[Any]:
+            """Just return an empty list."""
+            return []
+
+    class _CountingMonitor(utils.LoggingMonitor):
+
+        def __init__(self):
+            self.num_observations = 0
+
+        def reset(self, train_or_test, task_idx):
+            self.num_observations = 0
+
+        def observe(self, obs, action):
+            self.num_observations += 1
+
+    approach = _MockApproach(_value_error_policy)
+    cogman = CogMan(approach, perceiver, exec_monitor)
+    cogman.reset(task)
+
+    with pytest.raises(ValueError) as e:
+        _, _, _ = run_episode_and_get_observations(cogman,
+                                                   env,
+                                                   "test",
+                                                   0,
+                                                   max_num_steps=5)
+    assert "mock error" in str(e)
+
+    monitor = _CountingMonitor()
+    (states, _), _, _ = run_episode_and_get_observations(
+        cogman,
+        env,
+        "test",
+        0,
+        max_num_steps=5,
+        exceptions_to_break_on={ValueError},
+        monitor=monitor)
+
+    assert len(states) == 1
+    assert monitor.num_observations == 1
+
+    class _MockEnv:
+
+        @staticmethod
+        def reset(train_or_test, task_idx):
+            """Reset the mock environment."""
+            del train_or_test, task_idx  # unused
+            return DefaultState
+
+        @staticmethod
+        def step(action):
+            """Step the mock environment."""
+            del action  # unused
+            raise utils.EnvironmentFailure("mock failure")
+
+        def get_observation(self):
+            """Gets current observation in mock environment."""
+            return DefaultState
+
+        def goal_reached(self):
+            """Goal never reached."""
+            return False
+
+    mock_env = _MockEnv()
+    ones_policy = lambda _: Action(np.zeros(1, dtype=np.float32))
+    approach = _MockApproach(ones_policy)
+    cogman = CogMan(approach, perceiver, exec_monitor)
+    cogman.reset(task)
+    monitor = _CountingMonitor()
+    (states, actions), _, _ = run_episode_and_get_observations(
+        cogman,
+        mock_env,
+        "test",
+        0,
+        max_num_steps=5,
+        exceptions_to_break_on={utils.EnvironmentFailure},
+        monitor=monitor)
+    assert len(states) == 1
+    assert len(actions) == 0
+    assert monitor.num_observations == 1
+
+    # Test policy call time: the policy sleeps 0.1s on each of the 3 steps.
+    def _policy(_):
+        time.sleep(0.1)
+        return Action(env.action_space.sample())
+
+    approach = _MockApproach(_policy)
+    cogman = CogMan(approach, perceiver, exec_monitor)
+    cogman.reset(task)
+
+    _, _, metrics = run_episode_and_get_observations(cogman,
+                                                     env,
+                                                     "test",
+                                                     0,
+                                                     max_num_steps=3)
+    assert metrics["policy_call_time"] >= 3 * 0.1
+    assert metrics["num_options_executed"] == 0
+
+    # Test with monitor in case where an uncaught exception is raised.
+
+    def _policy(_):
+        raise ValueError("mock error")
+
+    monitor = _CountingMonitor()
+    approach = _MockApproach(_policy)
+    cogman = CogMan(approach, perceiver, exec_monitor)
+    cogman.reset(task)
+
+    try:
+        run_episode_and_get_observations(cogman,
+                                         mock_env,
+                                         "test",
+                                         0,
+                                         max_num_steps=3,
+                                         monitor=monitor)
+    except ValueError:
+        pass
+    assert monitor.num_observations == 1
diff --git a/tests/test_main.py b/tests/test_main.py
index 9891d3eafd..4db7a2e190 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -3,10 +3,8 @@
 import shutil
 import sys
 import tempfile
-import time
-from typing import Any, Callable, List
+from typing import Callable
 
-import numpy as np
 import pytest
 
 import predicators.ground_truth_models
@@ -17,9 +15,9 @@
 from predicators.envs.cover import CoverEnv
 from predicators.execution_monitoring import create_execution_monitor
 from predicators.ground_truth_models import get_gt_options
-from predicators.main import _run_episode, _run_testing, main
+from predicators.main import _run_testing, main
 from predicators.perception import create_perceiver
-from predicators.structs import Action, DefaultState, State, Task
+from predicators.structs import Action, State, Task
 
 _GROUND_TRUTH_MODULE_PATH = predicators.ground_truth_models.__name__
 
@@ -284,152 +282,3 @@ def test_env_failure():
     exec_monitor = create_execution_monitor("trivial")
     cogman = CogMan(approach, perceiver, exec_monitor)
     _run_testing(env, cogman)
-
-
-def test_run_episode():
-    """Tests for _run_episode()."""
-    utils.reset_config({"env": "cover"})
-    env = CoverEnv()
-    train_tasks = [t.task for t in env.get_train_tasks()]
-    cover_options = get_gt_options(env.get_name())
-    task = env.get_task("test", 0)
-    approach = create_approach("random_options", env.predicates, cover_options,
-                               env.types, env.action_space, train_tasks)
-    perceiver = create_perceiver("trivial")
-    exec_monitor = create_execution_monitor("trivial")
-    cogman = CogMan(approach, perceiver, exec_monitor)
-    cogman.reset(task)
-    (states, actions), solved, metrics = _run_episode(cogman,
-                                                      env,
-                                                      "test",
-                                                      0,
-                                                      max_num_steps=5)
-    assert not solved
-    assert len(states) == 6
-    assert len(actions) == 5
-    assert "policy_call_time" in metrics
-    assert metrics["policy_call_time"] > 0.0
-    assert metrics["num_options_executed"] > 0.0
-
-    # Test exceptions_to_break_on.
-    def _value_error_policy(_):
-        raise ValueError("mock error")
-
-    class _MockApproach:
-
-        def __init__(self, policy):
-            self._policy = policy
-
-        def solve(self, task, timeout):
-            """Just use the given policy."""
-            del task, timeout  # unused
-            return self._policy
-
-        def get_execution_monitoring_info(self) -> List[Any]:
-            """Just return empty list."""
-            return []
-
-    class _CountingMonitor(utils.LoggingMonitor):
-
-        def __init__(self):
-            self.num_observations = 0
-
-        def reset(self, train_or_test, task_idx):
-            self.num_observations = 0
-
-        def observe(self, obs, action):
-            self.num_observations += 1
-
-    approach = _MockApproach(_value_error_policy)
-    cogman = CogMan(approach, perceiver, exec_monitor)
-    cogman.reset(task)
-
-    with pytest.raises(ValueError) as e:
-        _, _, _ = _run_episode(cogman, env, "test", 0, max_num_steps=5)
-    assert "mock error" in str(e)
-
-    monitor = _CountingMonitor()
-    (states, _), _, _ = _run_episode(cogman,
-                                     env,
-                                     "test",
-                                     0,
-                                     max_num_steps=5,
-                                     exceptions_to_break_on={ValueError},
-                                     monitor=monitor)
-
-    assert len(states) == 1
-    assert monitor.num_observations == 1
-
-    class _MockEnv:
-
-        @staticmethod
-        def reset(train_or_test, task_idx):
-            """Reset the mock environment."""
-            del train_or_test, task_idx  # unused
-            return DefaultState
-
-        @staticmethod
-        def step(action):
-            """Step the mock environment."""
-            del action  # unused
-            raise utils.EnvironmentFailure("mock failure")
-
-        def get_observation(self):
-            """Gets currrent observation in mock environment."""
-            return DefaultState
-
-        def goal_reached(self):
-            """Goal never reached."""
-            return False
-
-    mock_env = _MockEnv()
-    ones_policy = lambda _: Action(np.zeros(1, dtype=np.float32))
-    approach = _MockApproach(ones_policy)
-    cogman = CogMan(approach, perceiver, exec_monitor)
-    cogman.reset(task)
-    monitor = _CountingMonitor()
-    (states, actions), _, _ = _run_episode(
-        cogman,
-        mock_env,
-        "test",
-        0,
-        max_num_steps=5,
-        exceptions_to_break_on={utils.EnvironmentFailure},
-        monitor=monitor)
-    assert len(states) == 1
-    assert len(actions) == 0
-    assert monitor.num_observations == 1
-
-    # Test policy call time.
-    def _policy(_):
-        time.sleep(0.1)
-        return Action(env.action_space.sample())
-
-    approach = _MockApproach(_policy)
-    cogman = CogMan(approach, perceiver, exec_monitor)
-    cogman.reset(task)
-
-    _, _, metrics = _run_episode(cogman, env, "test", 0, max_num_steps=3)
-    assert metrics["policy_call_time"] >= 3 * 0.1
-    assert metrics["num_options_executed"] == 0
-
-    # Test with monitor in case where an uncaught exception is raised.
-
-    def _policy(_):
-        raise ValueError("mock error")
-
-    monitor = _CountingMonitor()
-    approach = _MockApproach(_policy)
-    cogman = CogMan(approach, perceiver, exec_monitor)
-    cogman.reset(task)
-
-    try:
-        _run_episode(cogman,
-                     mock_env,
-                     "test",
-                     0,
-                     max_num_steps=3,
-                     monitor=monitor)
-    except ValueError:
-        pass
-    assert monitor.num_observations == 1
diff --git a/tests/test_predicate_search_score_functions.py b/tests/test_predicate_search_score_functions.py
index 13a7d0f327..fcc3c9c384 100644
--- a/tests/test_predicate_search_score_functions.py
+++ b/tests/test_predicate_search_score_functions.py
@@ -1,4 +1,5 @@
 """Tests for PredicateSearchScoreFunction classes."""
+import os
 from typing import Callable, FrozenSet, List, Set
 
 import numpy as np
@@ -156,7 +157,9 @@ def test_prediction_error_score_function():
     candidates = {p: 1.0 for p in name_to_pred.values()}
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(dataset.trajectories,
                                                     env.predicates)
     score_function = _PredictionErrorScoreFunction(initial_predicates,
@@ -191,7 +194,9 @@ def test_hadd_match_score_function():
     candidates = {p: 1.0 for p in name_to_pred.values()}
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(dataset.trajectories,
                                                     env.predicates)
     score_function = _RelaxationHeuristicMatchBasedScoreFunction(
@@ -222,7 +227,9 @@ def test_relaxation_energy_score_function():
     candidates = {p: 1.0 for p in name_to_pred.values()}
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(dataset.trajectories,
                                                     env.predicates)
     score_function = _RelaxationHeuristicEnergyBasedScoreFunction(
@@ -317,6 +324,9 @@ def test_exact_energy_score_function():
     """Tests for _ExactHeuristicEnergyBasedScoreFunction()."""
     # Just test this on BlocksEnv, since that's a known problem case
     # for hadd_energy_lookaheaddepth*.
+    # NOTE: without the dummy API key set below, utils.flush_cache()
+    # produces a nasty OpenAI error.
+    os.environ["OPENAI_API_KEY"] = "dummy API key"
     utils.flush_cache()
     utils.reset_config({
         "env": "blocks",
@@ -335,7 +345,9 @@ def test_exact_energy_score_function():
     candidates = {p: 1.0 for p in name_to_pred.values()}
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(dataset.trajectories,
                                                     env.predicates)
     score_function = _ExactHeuristicEnergyBasedScoreFunction(
@@ -375,7 +387,9 @@ def test_exact_energy_score_function():
 def test_count_score_functions():
     """Tests for _RelaxationHeuristicCountBasedScoreFunction() and
     _ExactHeuristicCountBasedScoreFunction."""
-
+    # NOTE: without the dummy API key set below, utils.flush_cache()
+    # produces a nasty OpenAI error.
+    os.environ["OPENAI_API_KEY"] = "dummy API key"
     utils.flush_cache()
     utils.reset_config({
         "env": "cover",
@@ -401,7 +415,9 @@ def test_count_score_functions():
     candidates[NotHandEmpty] = 1.0
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(dataset.trajectories,
                                                     env.predicates)
     for name in ["exact_count", "lmcut_count_lookaheaddepth0"]:
@@ -452,7 +468,9 @@ def test_branching_factor_score_function():
     }
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(
         dataset.trajectories, env.goal_predicates | set(candidates))
     score_function = _BranchingFactorScoreFunction(env.goal_predicates,
@@ -487,7 +505,9 @@ def test_task_planning_score_function():
     }
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(
         dataset.trajectories, env.goal_predicates | set(candidates))
     score_function = _TaskPlanningScoreFunction(env.goal_predicates,
@@ -533,8 +553,9 @@ def test_expected_nodes_score_function():
         }
         env_train_tasks = env.get_train_tasks()
         train_tasks = [t.task for t in env_train_tasks]
+        predicates, _ = utils.parse_config_excluded_predicates(env)
         dataset = create_dataset(env, train_tasks,
-                                 get_gt_options(env.get_name()))
+                                 get_gt_options(env.get_name()), predicates)
         atom_dataset = utils.create_ground_atom_dataset(
             dataset.trajectories, env.goal_predicates | set(candidates))
         score_function = _ExpectedNodesScoreFunction(
@@ -566,7 +587,9 @@ def test_expected_nodes_score_function():
     })
     env_train_tasks = env.get_train_tasks()
     train_tasks = [t.task for t in env_train_tasks]
-    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()))
+    predicates, _ = utils.parse_config_excluded_predicates(env)
+    dataset = create_dataset(env, train_tasks, get_gt_options(env.get_name()),
+                             predicates)
     atom_dataset = utils.create_ground_atom_dataset(
         dataset.trajectories, env.goal_predicates | set(candidates))
     score_function = _ExpectedNodesScoreFunction(
diff --git a/tests/test_pretrained_model_interface.py b/tests/test_pretrained_model_interface.py
index 13056ddacd..51aa3d8d81 100644
--- a/tests/test_pretrained_model_interface.py
+++ b/tests/test_pretrained_model_interface.py
@@ -8,7 +8,7 @@
 
 from predicators import utils
 from predicators.pretrained_model_interface import GoogleGeminiVLM, \
-    LargeLanguageModel, OpenAILLM, VisionLanguageModel
+    LargeLanguageModel, OpenAILLM, OpenAIVLM, VisionLanguageModel
 
 
 class _DummyLLM(LargeLanguageModel):
@@ -127,13 +127,21 @@ def test_openai_llm():
     if "OPENAI_API_KEY" not in os.environ:  # pragma: no cover
         os.environ["OPENAI_API_KEY"] = "dummy API key"
     # Create an OpenAILLM with the curie model.
-    llm = OpenAILLM("text-curie-001")
-    assert llm.get_id() == "openai-text-curie-001"
+    llm = OpenAILLM("gpt-4o")
+    assert llm.get_id() == "openai-gpt-4o"
     # Uncomment this to test manually, but do NOT uncomment in master, because
     # each query costs money.
-    # completions = llm.sample_completions("Hi", 0.5, 123, num_completions=2)
+    # completions = llm.sample_completions("Hi",
+    #                                      None,
+    #                                      0.5,
+    #                                      123,
+    #                                      num_completions=2)
     # assert len(completions) == 2
-    # completions2 = llm.sample_completions("Hi", 0.5, 123, num_completions=2)
+    # completions2 = llm.sample_completions("Hi",
+    #                                       None,
+    #                                       0.5,
+    #                                       123,
+    #                                       num_completions=2)
     # assert completions == completions2
     # shutil.rmtree(cache_dir)
 
@@ -147,3 +155,37 @@ def test_gemini_vlm():
     # Create an OpenAILLM with the curie model.
     vlm = GoogleGeminiVLM("gemini-pro-vision")
     assert vlm.get_id() == "Google-gemini-pro-vision"
+
+
+def test_openai_vlm():
+    """Tests for OpenAIVLM()."""
+    cache_dir = "_fake_llm_cache_dir"
+    utils.reset_config({"pretrained_model_prompt_cache_dir": cache_dir})
+    if "OPENAI_API_KEY" not in os.environ:  # pragma: no cover
+        os.environ["OPENAI_API_KEY"] = "dummy API key"
+    # Create an OpenAIVLM with the gpt-4-turbo model.
+    vlm = OpenAIVLM("gpt-4-turbo")
+    assert vlm.get_id() == "OpenAI-gpt-4-turbo"
+    dummy_img = Image.new('RGB', (100, 100))
+    vision_messages = vlm.prepare_vision_messages([dummy_img], "wakanda",
+                                                  "forever")
+    assert len(vision_messages) == 1
+    assert vision_messages[0]['content'][1]['type'] == 'image_url'
+    # NOTE: Uncomment below lines for actual test.
+    # images = [Image.open("tests/datasets/test_vlm_predicate_img.jpg")]
+    # prompt = """
+    #     Describe the object relationships between the objects and containers.
+    #     You can use following predicate-style descriptions:
+    #     Inside(object1, container)
+    #     Blocking(object1, object2)
+    #     On(object, surface)
+    #     """
+    # completions = vlm.sample_completions(prompt=prompt,
+    #                                      imgs=images,
+    #                                      temperature=0.5,
+    #                                      num_completions=3,
+    #                                      seed=0)
+    # assert len(completions) == 3
+    # for completion in completions:
+    #     assert "Inside" in completion
+    # shutil.rmtree(cache_dir)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 73aa0e4ce8..c169d8bb6d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -17,14 +17,35 @@
 from predicators.ground_truth_models import _get_predicates_by_names, \
     get_gt_nsrts, get_gt_options
 from predicators.nsrt_learning.segmentation import segment_trajectory
+from predicators.pretrained_model_interface import VisionLanguageModel
 from predicators.settings import CFG
 from predicators.structs import NSRT, Action, DefaultState, DummyOption, \
     GroundAtom, LowLevelTrajectory, ParameterizedOption, Predicate, Segment, \
-    State, STRIPSOperator, Type, Variable
+    State, STRIPSOperator, Type, Variable, VLMPredicate
 from predicators.utils import GoalCountHeuristic, _PyperplanHeuristicWrapper, \
     _TaskPlanningHeuristic
 
 
+class _DummyVLM(VisionLanguageModel):
+
+    def get_id(self):
+        return "dummy"
+
+    def _sample_completions(self,
+                            prompt,
+                            imgs,
+                            temperature,
+                            seed,
+                            stop_token=None,
+                            num_completions=1):
+        del prompt, imgs, temperature, seed, stop_token  # unused.
+        completions = []
+        for _ in range(num_completions):
+            completion = "* is_fishy: True."
+            completions.append(completion)
+        return completions
+
+
 @pytest.mark.parametrize("max_groundings,exp_num_true,exp_num_false",
                          [(-1, 0, 0), (None, 1, 1)])
 def test_count_positives_for_ops(max_groundings, exp_num_true, exp_num_false):
@@ -1099,6 +1120,15 @@ def _classifier2(state, objects):
     }
     # Wrapping a predicate should destroy its classifier.
     assert not utils.abstract(state, {wrapped_pred1, wrapped_pred2})
+    # Now, test the case where we abstract using a VLM predicate.
+    utils.reset_config({"seed": 123})
+    vlm_pred = VLMPredicate("IsFishy", [], lambda s, o: NotImplementedError,
+                            lambda o: "is_fishy")
+    vlm_state = state.copy()
+    vlm_state.simulator_state = [np.zeros((30, 30, 3), dtype=np.uint8)]
+    vlm_atoms_set = utils.abstract(vlm_state, [vlm_pred], _DummyVLM())
+    assert len(vlm_atoms_set) == 1
+    assert "IsFishy" in str(vlm_atoms_set)
 
 
 def test_create_new_variables():
@@ -2499,6 +2529,20 @@ def test_save_video():
     os.rmdir(dirname)
 
 
+def test_save_images():
+    """Tests for save_images()."""
+    dirname = "_fake_tmp_images_dir"
+    prefix = "image_prefix"
+    utils.reset_config({"image_dir": dirname})
+    rng = np.random.default_rng(123)
+    video = [rng.integers(255, size=(3, 3), dtype=np.uint8) for _ in range(3)]
+    utils.save_images(prefix, video)
+    os.remove(os.path.join(dirname, prefix + "_image_0.png"))
+    os.remove(os.path.join(dirname, prefix + "_image_1.png"))
+    os.remove(os.path.join(dirname, prefix + "_image_2.png"))
+    os.rmdir(dirname)
+
+
 def test_get_config_path_str():
     """Tests for get_config_path_str()."""
     utils.reset_config({