diff --git a/skyvern/exceptions.py b/skyvern/exceptions.py index 389e949ffc..2d10234ca0 100644 --- a/skyvern/exceptions.py +++ b/skyvern/exceptions.py @@ -451,7 +451,7 @@ def __init__(self) -> None: class NoIncrementalElementFoundForCustomSelection(SkyvernException): def __init__(self, element_id: str) -> None: super().__init__( - f"No incremental element found, maybe try an input action or taking the select action on other elements. element_id={element_id}" + f"No incremental element found, try it again later or try another element. element_id={element_id}" ) diff --git a/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 b/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 index 89d9d21fb0..db2c8f0def 100644 --- a/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 +++ b/skyvern/forge/prompts/skyvern/confirm-multi-selection-finish.j2 @@ -1,13 +1,22 @@ -Confirm if the user has finished the multi-level selection based on the screenshot, user details, the HTML elements and select history provided in the list. +Confirm if the user has finished the mini goal in the current opened dropdown selection based on the screenshot, user details, the HTML elements and select history provided in the list. + +NOTE: +- Only consider the mini goal is achieved when there is(are) one(several) valid options selected in the dropdown. +- Sometimes it's a multi-level selection dropdown, you need to select multiple times to pick a valid option(sub-option). Reply in JSON format with the following keys: { - "page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements. Your action should be based on the current page information. - "think": str, // Think step by step. Describe how you think the user has finished the multi-level selection. - "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "is_finished": bool, // True if the user has finished the multi-level selection, False otherwise. + "page_info": str, // Think step by step. Describe the page information you parsed from the HTML elements and the screenshot. Your decision should be based on the current page information. + "think": str, // Think step by step. Describe how you think the user has finished the mini goal in the current opened dropdown selection. + "is_multiple_selection": bool, // True if it's a multi-level selection, otheriwse False. + "is_mini_goal_finished": bool, // True if the user has finished the mini goal in the current opened dropdown selection, False otherwise. } +Mini Goal: +``` +Select an option for "{{ mini_goal }}" +``` + User goal: ``` {{ navigation_goal }} diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 790da6daf2..bf6be4d8cf 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -75,6 +75,7 @@ def __repr__(self) -> str: class InputOrSelectContext(BaseModel): + intention: str | None = None field: str | None = None is_required: bool | None = None is_search_bar: bool | None = None # don't trigger custom-selection logic when it's a search bar @@ -82,7 +83,7 @@ class InputOrSelectContext(BaseModel): is_date_related: bool | None = None # date picker mini agent requires some special logic def __repr__(self) -> str: - return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input})" + return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})" class Action(BaseModel): diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 3b3dd46ff7..0725a02c8b 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -582,6 +582,7 @@ async def handle_input_text_action( reasoning=action.reasoning, element_id=skyvern_element.get_id(), option=SelectOption(label=text), + intention=action.intention, ) if skyvern_element.get_selectable(): LOG.info( @@ -761,6 +762,7 @@ async def handle_input_text_action( json_response = await app.SECONDARY_LLM_API_HANDLER( prompt=prompt, step=step, prompt_name="parse-input-or-select-context" ) + json_response["intention"] = action.intention input_or_select_context = InputOrSelectContext.model_validate(json_response) LOG.info( "Parsed input/select context", @@ -999,6 +1001,7 @@ async def handle_select_option_action( reasoning=action.reasoning, element_id=selectable_child.get_id(), option=action.option, + intention=action.intention, ) action = select_action skyvern_element = selectable_child @@ -1045,6 +1048,7 @@ async def handle_select_option_action( reasoning=action.reasoning, element_id=blocking_element.get_id(), option=action.option, + intention=action.intention, ) action = select_action skyvern_element = blocking_element @@ -1666,7 +1670,7 @@ async def choose_auto_completion_dropdown( auto_completion_confirm_prompt = prompt_engine.load_prompt( "auto-completion-choose-option", is_search=context.is_search_bar, - field_information=context.field, + field_information=context.field if not context.intention else context.intention, filled_value=text, navigation_goal=task.navigation_goal, navigation_payload_str=json.dumps(task.navigation_payload), @@ -1815,10 +1819,16 @@ async def input_or_auto_complete_input( tried_values.append(current_value) whole_new_elements.extend(result.incremental_elements) + field_information = ( + input_or_select_context.field + if not input_or_select_context.intention + else input_or_select_context.intention + ) + prompt = prompt_engine.load_prompt( "auto-completion-potential-answers", potential_value_count=AUTO_COMPLETION_POTENTIAL_VALUES_COUNT, - field_information=input_or_select_context.field, + field_information=field_information, current_value=current_value, navigation_goal=task.navigation_goal, navigation_payload_str=json.dumps(task.navigation_payload), @@ -1879,7 +1889,7 @@ async def input_or_auto_complete_input( cleaned_new_elements = remove_duplicated_HTML_element(whole_new_elements) prompt = prompt_engine.load_prompt( "auto-completion-tweak-value", - field_information=input_or_select_context.field, + field_information=field_information, current_value=current_value, navigation_goal=task.navigation_goal, navigation_payload_str=json.dumps(task.navigation_payload), @@ -1940,6 +1950,7 @@ async def sequentially_select_from_dropdown( json_response = await app.SECONDARY_LLM_API_HANDLER( prompt=prompt, step=step, prompt_name="parse-input-or-select-context" ) + json_response["intention"] = action.intention input_or_select_context = InputOrSelectContext.model_validate(json_response) LOG.info( "Parsed input/select context", @@ -2038,21 +2049,26 @@ async def sequentially_select_from_dropdown( # it's for typing. it's been verified in `single_select_result.is_done()` assert single_select_result.dropdown_menu is not None - screenshot = await single_select_result.dropdown_menu.get_locator().screenshot( - timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS + screenshot = await page.screenshot(timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS) + mini_goal = ( + input_or_select_context.field + if not input_or_select_context.intention + else input_or_select_context.intention ) prompt = prompt_engine.load_prompt( "confirm-multi-selection-finish", + mini_goal=mini_goal, navigation_goal=task.navigation_goal, navigation_payload_str=json.dumps(task.navigation_payload), elements="".join(json_to_html(element) for element in secondary_increment_element), select_history=json.dumps(build_sequential_select_history(select_history)), local_datetime=datetime.now(ensure_context().tz_info).isoformat(), ) - json_response = await app.SECONDARY_LLM_API_HANDLER( + json_response = await app.LLM_API_HANDLER( prompt=prompt, screenshots=[screenshot], step=step, prompt_name="confirm-multi-selection-finish" ) - if json_response.get("is_finished", False): + if json_response.get("is_mini_goal_finished", False): + LOG.info("The user has finished the selection for the current opened dropdown", step_id=step.step_id) return single_select_result.action_result, values[-1] if len(values) > 0 else None return select_history[-1].action_result if len(select_history) > 0 else None, values[-1] if len( @@ -2138,7 +2154,7 @@ async def select_from_dropdown( prompt = prompt_engine.load_prompt( "custom-select", is_date_related=context.is_date_related, - field_information=context.field, + field_information=context.field if not context.intention else context.intention, required_field=context.is_required, target_value="" if force_select else target_value, navigation_goal=task.navigation_goal, @@ -2588,6 +2604,7 @@ async def normal_select( json_response = await app.SECONDARY_LLM_API_HANDLER( prompt=prompt, step=step, prompt_name="parse-input-or-select-context" ) + json_response["intention"] = action.intention input_or_select_context = InputOrSelectContext.model_validate(json_response) LOG.info( "Parsed input/select context", @@ -2597,10 +2614,12 @@ async def normal_select( ) options_html = skyvern_element.build_HTML() - + field_information = ( + input_or_select_context.field if not input_or_select_context.intention else input_or_select_context.intention + ) prompt = prompt_engine.load_prompt( "normal-select", - field_information=input_or_select_context.field, + field_information=field_information, required_field=input_or_select_context.is_required, navigation_goal=task.navigation_goal, navigation_payload_str=json.dumps(task.navigation_payload),