Skip to content

Commit

Permalink
optimize multiple select (#1703)
Browse files Browse the repository at this point in the history
  • Loading branch information
wintonzheng authored Feb 3, 2025
1 parent 36de8bd commit b4f2ec9
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 17 deletions.
2 changes: 1 addition & 1 deletion skyvern/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,7 @@ def __init__(self) -> None:
class NoIncrementalElementFoundForCustomSelection(SkyvernException):
def __init__(self, element_id: str) -> None:
super().__init__(
f"No incremental element found, maybe try an input action or taking the select action on other elements. element_id={element_id}"
f"No incremental element found, try it again later or try another element. element_id={element_id}"
)


Expand Down
19 changes: 14 additions & 5 deletions skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
Confirm if the user has finished the multi-level selection based on the screenshot, user details, the HTML elements and select history provided in the list.
Confirm if the user has finished the mini goal in the current opened dropdown selection based on the screenshot, user details, the HTML elements and select history provided in the list.

NOTE:
- Only consider the mini goal achieved when one or more valid options are selected in the dropdown.
- Sometimes it's a multi-level selection dropdown, where you need to select multiple times to pick a valid option (sub-option).

Reply in JSON format with the following keys:
{
"page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements. Your action should be based on the current page information.
"think": str, // Think step by step. Describe how you think the user has finished the multi-level selection.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"is_finished": bool, // True if the user has finished the multi-level selection, False otherwise.
"page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements and the screenshot. Your decision should be based on the current page information.
"think": str, // Think step by step. Describe how you think the user has finished the mini goal in the current opened dropdown selection.
"is_multiple_selection": bool, // True if it's a multi-level selection, otherwise False.
"is_mini_goal_finished": bool, // True if the user has finished the mini goal in the current opened dropdown selection, False otherwise.
}

Mini Goal:
```
Select an option for "{{ mini_goal }}"
```

User goal:
```
{{ navigation_goal }}
Expand Down
3 changes: 2 additions & 1 deletion skyvern/webeye/actions/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,15 @@ def __repr__(self) -> str:


class InputOrSelectContext(BaseModel):
intention: str | None = None
field: str | None = None
is_required: bool | None = None
is_search_bar: bool | None = None # don't trigger custom-selection logic when it's a search bar
is_location_input: bool | None = None # address input usually requires auto completion
is_date_related: bool | None = None # date picker mini agent requires some special logic

def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})"
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"


class Action(BaseModel):
Expand Down
39 changes: 29 additions & 10 deletions skyvern/webeye/actions/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ async def handle_input_text_action(
reasoning=action.reasoning,
element_id=skyvern_element.get_id(),
option=SelectOption(label=text),
intention=action.intention,
)
if skyvern_element.get_selectable():
LOG.info(
Expand Down Expand Up @@ -761,6 +762,7 @@ async def handle_input_text_action(
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
Expand Down Expand Up @@ -999,6 +1001,7 @@ async def handle_select_option_action(
reasoning=action.reasoning,
element_id=selectable_child.get_id(),
option=action.option,
intention=action.intention,
)
action = select_action
skyvern_element = selectable_child
Expand Down Expand Up @@ -1045,6 +1048,7 @@ async def handle_select_option_action(
reasoning=action.reasoning,
element_id=blocking_element.get_id(),
option=action.option,
intention=action.intention,
)
action = select_action
skyvern_element = blocking_element
Expand Down Expand Up @@ -1666,7 +1670,7 @@ async def choose_auto_completion_dropdown(
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
is_search=context.is_search_bar,
field_information=context.field,
field_information=context.field if not context.intention else context.intention,
filled_value=text,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
Expand Down Expand Up @@ -1815,10 +1819,16 @@ async def input_or_auto_complete_input(
tried_values.append(current_value)
whole_new_elements.extend(result.incremental_elements)

field_information = (
input_or_select_context.field
if not input_or_select_context.intention
else input_or_select_context.intention
)

prompt = prompt_engine.load_prompt(
"auto-completion-potential-answers",
potential_value_count=AUTO_COMPLETION_POTENTIAL_VALUES_COUNT,
field_information=input_or_select_context.field,
field_information=field_information,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
Expand Down Expand Up @@ -1879,7 +1889,7 @@ async def input_or_auto_complete_input(
cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements)
prompt = prompt_engine.load_prompt(
"auto-completion-tweak-value",
field_information=input_or_select_context.field,
field_information=field_information,
current_value=current_value,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
Expand Down Expand Up @@ -1940,6 +1950,7 @@ async def sequentially_select_from_dropdown(
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
Expand Down Expand Up @@ -2038,21 +2049,26 @@ async def sequentially_select_from_dropdown(

# it's for typing. it's been verified in `single_select_result.is_done()`
assert single_select_result.dropdown_menu is not None
screenshot = await single_select_result.dropdown_menu.get_locator().screenshot(
timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS
screenshot = await page.screenshot(timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS)
mini_goal = (
input_or_select_context.field
if not input_or_select_context.intention
else input_or_select_context.intention
)
prompt = prompt_engine.load_prompt(
"confirm-multi-selection-finish",
mini_goal=mini_goal,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
elements="".join(json_to_html(element) for element in secondary_increment_element),
select_history=json.dumps(build_sequential_select_history(select_history)),
local_datetime=datetime.now(ensure_context().tz_info).isoformat(),
)
json_response = await app.SECONDARY_LLM_API_HANDLER(
json_response = await app.LLM_API_HANDLER(
prompt=prompt, screenshots=[screenshot], step=step, prompt_name="confirm-multi-selection-finish"
)
if json_response.get("is_finished", False):
if json_response.get("is_mini_goal_finished", False):
LOG.info("The user has finished the selection for the current opened dropdown", step_id=step.step_id)
return single_select_result.action_result, values[-1] if len(values) > 0 else None

return select_history[-1].action_result if len(select_history) > 0 else None, values[-1] if len(
Expand Down Expand Up @@ -2138,7 +2154,7 @@ async def select_from_dropdown(
prompt = prompt_engine.load_prompt(
"custom-select",
is_date_related=context.is_date_related,
field_information=context.field,
field_information=context.field if not context.intention else context.intention,
required_field=context.is_required,
target_value="" if force_select else target_value,
navigation_goal=task.navigation_goal,
Expand Down Expand Up @@ -2588,6 +2604,7 @@ async def normal_select(
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
Expand All @@ -2597,10 +2614,12 @@ async def normal_select(
)

options_html = skyvern_element.build_HTML()

field_information = (
input_or_select_context.field if not input_or_select_context.intention else input_or_select_context.intention
)
prompt = prompt_engine.load_prompt(
"normal-select",
field_information=input_or_select_context.field,
field_information=field_information,
required_field=input_or_select_context.is_required,
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
Expand Down

0 comments on commit b4f2ec9

Please sign in to comment.