optimize multiple select (#1703)

Skyvern-AI · Feb 3, 2025 · b4f2ec9 · b4f2ec9
1 parent 36de8bd
commit b4f2ec9
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 17 deletions.
diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py
@@ -451,7 +451,7 @@ def __init__(self) -> None:
 class NoIncrementalElementFoundForCustomSelection(SkyvernException):
     def __init__(self, element_id: str) -> None:
         super().__init__(
-            f"No incremental element found, maybe try an input action or taking the select action on other elements. element_id={element_id}"
+            f"No incremental element found, try it again later or try another element. element_id={element_id}"
         )
 
 

diff --git a/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 b/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2
@@ -1,13 +1,22 @@
-Confirm if the user has finished the multi-level selection based on the screenshot, user details, the HTML elements and select history provided in the list.
+Confirm if the user has finished the mini goal in the current opened dropdown selection based on the screenshot, user details, the HTML elements and select history provided in the list.
+
+NOTE:
+- Only consider the mini goal achieved when one or more valid options have been selected in the dropdown.
+- Sometimes it's a multi-level selection dropdown, where you need to select multiple times to pick a valid option (or sub-option).
 
 Reply in JSON format with the following keys:
 {
-    "page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements. Your action should be based on the current page information.
-    "think": str, // Think step by step. Describe how you think the user has finished the multi-level selection.
-    "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
-    "is_finished": bool, // True if the user has finished the multi-level selection, False otherwise.
+    "page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements and the screenshot. Your decision should be based on the current page information.
+    "think": str, // Think step by step. Describe how you think the user has finished the mini goal in the current opened dropdown selection.
+    "is_multiple_selection": bool, // True if it's a multi-level selection, otherwise False.
+    "is_mini_goal_finished": bool, // True if the user has finished the mini goal in the current opened dropdown selection, False otherwise.
 }
 
+Mini Goal:
+```
+Select an option for "{{ mini_goal }}"
+```
+
 User goal:
 ```
 {{ navigation_goal }}

diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py
@@ -75,14 +75,15 @@ def __repr__(self) -> str:
 
 
 class InputOrSelectContext(BaseModel):
+    intention: str | None = None
     field: str | None = None
     is_required: bool | None = None
     is_search_bar: bool | None = None  # don't trigger custom-selection logic when it's a search bar
     is_location_input: bool | None = None  # address input usually requires auto completion
     is_date_related: bool | None = None  # date picker mini agent requires some special logic
 
     def __repr__(self) -> str:
-        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"
+        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"
 
 
 class Action(BaseModel):

diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
@@ -582,6 +582,7 @@ async def handle_input_text_action(
         reasoning=action.reasoning,
         element_id=skyvern_element.get_id(),
         option=SelectOption(label=text),
+        intention=action.intention,
     )
     if skyvern_element.get_selectable():
         LOG.info(
@@ -761,6 +762,7 @@ async def handle_input_text_action(
             json_response = await app.SECONDARY_LLM_API_HANDLER(
                 prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
             )
+            json_response["intention"] = action.intention
             input_or_select_context = InputOrSelectContext.model_validate(json_response)
             LOG.info(
                 "Parsed input/select context",
@@ -999,6 +1001,7 @@ async def handle_select_option_action(
                 reasoning=action.reasoning,
                 element_id=selectable_child.get_id(),
                 option=action.option,
+                intention=action.intention,
             )
             action = select_action
             skyvern_element = selectable_child
@@ -1045,6 +1048,7 @@ async def handle_select_option_action(
             reasoning=action.reasoning,
             element_id=blocking_element.get_id(),
             option=action.option,
+            intention=action.intention,
         )
         action = select_action
         skyvern_element = blocking_element
@@ -1666,7 +1670,7 @@ async def choose_auto_completion_dropdown(
         auto_completion_confirm_prompt = prompt_engine.load_prompt(
             "auto-completion-choose-option",
             is_search=context.is_search_bar,
-            field_information=context.field,
+            field_information=context.field if not context.intention else context.intention,
             filled_value=text,
             navigation_goal=task.navigation_goal,
             navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1815,10 +1819,16 @@ async def input_or_auto_complete_input(
         tried_values.append(current_value)
         whole_new_elements.extend(result.incremental_elements)
 
+        field_information = (
+            input_or_select_context.field
+            if not input_or_select_context.intention
+            else input_or_select_context.intention
+        )
+
         prompt = prompt_engine.load_prompt(
             "auto-completion-potential-answers",
             potential_value_count=AUTO_COMPLETION_POTENTIAL_VALUES_COUNT,
-            field_information=input_or_select_context.field,
+            field_information=field_information,
             current_value=current_value,
             navigation_goal=task.navigation_goal,
             navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1879,7 +1889,7 @@ async def input_or_auto_complete_input(
             cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
             prompt = prompt_engine.load_prompt(
                 "auto-completion-tweak-value",
-                field_information=input_or_select_context.field,
+                field_information=field_information,
                 current_value=current_value,
                 navigation_goal=task.navigation_goal,
                 navigation_payload_str=json.dumps(task.navigation_payload),
@@ -1940,6 +1950,7 @@ async def sequentially_select_from_dropdown(
     json_response = await app.SECONDARY_LLM_API_HANDLER(
         prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
     )
+    json_response["intention"] = action.intention
     input_or_select_context = InputOrSelectContext.model_validate(json_response)
     LOG.info(
         "Parsed input/select context",
@@ -2038,21 +2049,26 @@ async def sequentially_select_from_dropdown(
 
         # it's for typing. it's been verified in `single_select_result.is_done()`
         assert single_select_result.dropdown_menu is not None
-        screenshot = await single_select_result.dropdown_menu.get_locator().screenshot(
-            timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS
+        screenshot = await page.screenshot(timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS)
+        mini_goal = (
+            input_or_select_context.field
+            if not input_or_select_context.intention
+            else input_or_select_context.intention
         )
         prompt = prompt_engine.load_prompt(
             "confirm-multi-selection-finish",
+            mini_goal=mini_goal,
             navigation_goal=task.navigation_goal,
             navigation_payload_str=json.dumps(task.navigation_payload),
             elements="".join(json_to_html(element) for element in secondary_increment_element),
             select_history=json.dumps(build_sequential_select_history(select_history)),
             local_datetime=datetime.now(ensure_context().tz_info).isoformat(),
         )
-        json_response = await app.SECONDARY_LLM_API_HANDLER(
+        json_response = await app.LLM_API_HANDLER(
             prompt=prompt, screenshots=[screenshot], step=step, prompt_name="confirm-multi-selection-finish"
         )
-        if json_response.get("is_finished", False):
+        if json_response.get("is_mini_goal_finished", False):
+            LOG.info("The user has finished the selection for the current opened dropdown", step_id=step.step_id)
             return single_select_result.action_result, values[-1] if len(values) > 0 else None
 
     return select_history[-1].action_result if len(select_history) > 0 else None, values[-1] if len(
@@ -2138,7 +2154,7 @@ async def select_from_dropdown(
     prompt = prompt_engine.load_prompt(
         "custom-select",
         is_date_related=context.is_date_related,
-        field_information=context.field,
+        field_information=context.field if not context.intention else context.intention,
         required_field=context.is_required,
         target_value="" if force_select else target_value,
         navigation_goal=task.navigation_goal,
@@ -2588,6 +2604,7 @@ async def normal_select(
     json_response = await app.SECONDARY_LLM_API_HANDLER(
         prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
     )
+    json_response["intention"] = action.intention
     input_or_select_context = InputOrSelectContext.model_validate(json_response)
     LOG.info(
         "Parsed input/select context",
@@ -2597,10 +2614,12 @@ async def normal_select(
     )
 
     options_html = skyvern_element.build_HTML()
-
+    field_information = (
+        input_or_select_context.field if not input_or_select_context.intention else input_or_select_context.intention
+    )
     prompt = prompt_engine.load_prompt(
         "normal-select",
-        field_information=input_or_select_context.field,
+        field_information=field_information,
         required_field=input_or_select_context.is_required,
         navigation_goal=task.navigation_goal,
         navigation_payload_str=json.dumps(task.navigation_payload),