"""Source code for pyba.core.agent.playwright_agent."""

import json
from types import SimpleNamespace
from typing import Dict, List, Union, Any

from pydantic import BaseModel

from pyba.core.agent.base_agent import BaseAgent
from pyba.core.agent.extraction_agent import ExtractionAgent
from pyba.utils.exceptions import LLMResponseParseError
from pyba.utils.prompts import general_prompt, output_prompt
from pyba.utils.structure import PlaywrightResponse


class PlaywrightAgent(BaseAgent):
    """
    Defines the playwright agent's actions

    Provides two endpoints:
    - process_action: for returning the right action on a page
    - get_output: for summarizing the chat and returning a string
    """

    def __init__(self, engine) -> None:
        """
        Args:
            engine: holds all the arguments from the user including the mode
        """
        super().__init__(engine=engine)  # Initialising the base params from BaseAgent
        self.action_agent, self.output_agent = self.llm_factory.get_agent()

    def _initialise_prompt(
        self,
        cleaned_dom: Dict[str, Union[List, str]],
        user_prompt: str,
        main_instruction: str,
        action_history: str = None,
        fail_reason: str = None,
        action_status: bool = None,
    ):
        """
        Formats the prompt template by injecting DOM data and action context.

        The caller's `cleaned_dom` is NOT modified: the template is formatted from
        a merged copy, so the context keys added here never leak back into the
        dictionary that the caller keeps using afterwards.

        Args:
            cleaned_dom: Dictionary of extracted DOM elements.
            user_prompt: The user's task instruction.
            main_instruction: The prompt template to format.
            action_history: The full natural language history of actions taken so far.
            fail_reason: Reason the previous action failed, if applicable.
            action_status: Whether the previous action succeeded.

        Returns:
            The formatted prompt string.
        """
        # Later keys win, so the context values override any same-named DOM keys
        # (same precedence as assigning them onto the dictionary directly).
        prompt_fields = {
            **cleaned_dom,
            "user_prompt": user_prompt,
            "action_history": action_history,
            "action_status": action_status,
            "fail_reason": fail_reason,
        }
        return main_instruction.format(**prompt_fields)

    @staticmethod
    def _maybe_run_extraction(
        extract_info_flag: Any, extractor: Any, user_prompt: str, cleaned_dom: Dict
    ) -> None:
        """
        Starts the info extraction when the model asked for it.

        Args:
            extract_info_flag: The `extract_info` value from the model response
            extractor: The extraction agent for this call
            user_prompt: The original user prompt, forwarded as the extraction task
            cleaned_dom: A dictionary that holds the `actual_text` to extract from
        """
        if extract_info_flag:
            extractor.run_threaded_info_extraction(
                task=user_prompt, actual_text=cleaned_dom["actual_text"]
            )

    def _call_model(
        self,
        agent: Any,
        prompt: str,
        agent_type: str,
        cleaned_dom: Dict = None,
        context_id: str = None,
        extractor=None,
        user_prompt: str = None,
    ) -> Any:
        """
        Generic method to call the correct LLM provider and parse the response.

        The provider is chosen from `self.engine.provider`: "openai", "vertexai",
        and anything else is handled through the Gemini execution path.

        Args:
            agent: The agent to use (action_agent or output_agent)
            prompt: The fully formatted prompt string
            agent_type: "action" or "output", to determine parsing logic
            cleaned_dom: A dictionary that holds the `actual_text` from which the data
                is to be extracted
            context_id: A unique identifier for this browser window (useful when
                multiple windows)
            extractor: The extraction agent for this call (passed in to avoid shared
                mutable state)
            user_prompt: The original user prompt for this call (passed in to avoid
                shared mutable state)

        Returns:
            The parsed response: the first action for "action" (a SimpleNamespace for
            OpenAI, the provider's parsed action object otherwise), a str for
            "output", and None for any other `agent_type`.

        Raises:
            LLMResponseParseError: If the response cannot be parsed or lacks the
                field required for the requested `agent_type`.
        """
        provider = self.engine.provider

        if provider == "openai":
            response = self.handle_openai_execution(
                agent=agent, prompt=prompt, context_id=context_id
            )
            try:
                # TypeError covers `content` being None (json.loads rejects non-str input)
                parsed_json = json.loads(response.choices[0].message.content)
            except (json.JSONDecodeError, IndexError, AttributeError, TypeError) as e:
                raise LLMResponseParseError(
                    "OpenAI returned a response that could not be parsed as JSON. "
                    "The model may have produced malformed output.",
                    cause=e,
                ) from e

            # Valid JSON that is not an object (e.g. a list) has no `.get`
            if not isinstance(parsed_json, dict):
                raise LLMResponseParseError(
                    "OpenAI returned JSON that is not an object. "
                    "The model may have produced malformed output.",
                )

            if agent_type == "action":
                actions_list = parsed_json.get("actions")
                if not actions_list:
                    raise LLMResponseParseError(
                        "OpenAI response contained no 'actions' field. "
                        "The model did not produce a valid next action.",
                    )

                # Only the first proposed action is used
                actions = SimpleNamespace(**actions_list[0])
                self._maybe_run_extraction(
                    parsed_json.get("extract_info"), extractor, user_prompt, cleaned_dom
                )
                return actions

            if agent_type == "output":
                return str(parsed_json.get("output"))

            return None

        if provider == "vertexai":
            response = self.handle_vertexai_execution(
                agent=agent, prompt=prompt, context_id=context_id
            )
            try:
                parsed_object = getattr(
                    response, "output_parsed", getattr(response, "parsed", None)
                )
                if not parsed_object:
                    raise LLMResponseParseError(
                        "VertexAI returned a response with no parsed object. "
                        "The model may have returned an empty or malformed response.",
                    )

                if agent_type == "action":
                    if hasattr(parsed_object, "actions") and parsed_object.actions:
                        actions = parsed_object.actions[0]
                        self._maybe_run_extraction(
                            parsed_object.extract_info, extractor, user_prompt, cleaned_dom
                        )
                        return actions
                    raise LLMResponseParseError(
                        "VertexAI response contained no 'actions'. "
                        "The model did not produce a valid next action.",
                    )

                if agent_type == "output":
                    if hasattr(parsed_object, "output") and parsed_object.output:
                        return str(parsed_object.output)
                    raise LLMResponseParseError(
                        "VertexAI response contained no 'output'. "
                        "The model did not produce a final summary.",
                    )
            except LLMResponseParseError:
                # Already the right error type, let it through untouched
                raise
            except Exception as e:
                raise LLMResponseParseError(
                    f"Failed to parse VertexAI response: {type(e).__name__}: {e}",
                    cause=e,
                ) from e

            return None

        # Any other provider goes through the Gemini execution path
        response = self.handle_gemini_execution(
            agent=agent, prompt=prompt, context_id=context_id
        )
        try:
            parsed_object = agent["response_format"].model_validate_json(response.text)
        except Exception as e:
            raise LLMResponseParseError(
                "Gemini returned a response that could not be parsed. "
                "The model may have produced malformed JSON output.",
                cause=e,
            ) from e

        if agent_type == "action":
            if parsed_object.actions:
                actions = parsed_object.actions[0]
                self._maybe_run_extraction(
                    parsed_object.extract_info, extractor, user_prompt, cleaned_dom
                )
                return actions
            raise LLMResponseParseError(
                "Gemini response contained no 'actions'. "
                "The model did not produce a valid next action.",
            )

        if agent_type == "output":
            return str(parsed_object.output)

        return None

    def process_action(
        self,
        cleaned_dom: Dict[str, Union[List, str]],
        user_prompt: str,
        action_history: str = None,
        fail_reason: str = None,
        extraction_format: BaseModel = None,
        context_id: str = None,
        action_status: bool = None,
    ) -> PlaywrightResponse:
        """
        Processes the current DOM and returns the next action to execute.

        Args:
            cleaned_dom: Dictionary of extracted DOM elements (hyperlinks, input_fields,
                clickable_fields, actual_text).
            user_prompt: The user's task instruction.
            action_history: The full natural language history of actions taken so far.
            fail_reason: Reason the previous action failed, if applicable.
            extraction_format: Pydantic model defining the extraction output schema.
            context_id: Unique identifier for this browser window (used in BFS mode).
            action_status: Whether the previous action succeeded.

        Returns:
            The first action proposed by the model for the configured provider.

        Raises:
            LLMResponseParseError: If the model response cannot be parsed or
                contains no actions.
        """
        prompt = self._initialise_prompt(
            cleaned_dom=cleaned_dom,
            user_prompt=user_prompt,
            main_instruction=general_prompt[self.engine.provider],
            action_history=action_history if action_history else "",
            fail_reason=fail_reason if fail_reason else "",
            # Only a missing status is blanked out: an explicit False (the previous
            # action failed) must reach the prompt instead of being erased.
            action_status="" if action_status is None else action_status,
        )

        # A fresh extractor per call avoids shared mutable state between calls
        extractor = ExtractionAgent(engine=self.engine, extraction_format=extraction_format)

        return self._call_model(
            agent=self.action_agent,
            prompt=prompt,
            agent_type="action",
            cleaned_dom=cleaned_dom,
            context_id=context_id,
            extractor=extractor,
            user_prompt=user_prompt,
        )

    def get_output(
        self, cleaned_dom: Dict[str, Union[List, str]], user_prompt: str, context_id: str = None
    ) -> str:
        """
        Gets the final text output from the model based on the current page state.

        Args:
            cleaned_dom: Dictionary of extracted DOM elements.
            user_prompt: The user's task instruction.
            context_id: Unique identifier for this browser window.

        Returns:
            The model's output converted to a string.

        Raises:
            LLMResponseParseError: If the model response cannot be parsed.
        """
        prompt = self._initialise_prompt(
            cleaned_dom=cleaned_dom, user_prompt=user_prompt, main_instruction=output_prompt
        )

        return self._call_model(
            agent=self.output_agent, prompt=prompt, agent_type="output", context_id=context_id
        )