Source code for pyba.core.agent.playwright_agent

import json
from types import SimpleNamespace
from typing import Dict, List, Union, Any

from pydantic import BaseModel

from pyba.core.agent.base_agent import BaseAgent
from pyba.core.agent.extraction_agent import ExtractionAgent
from pyba.utils.prompts import general_prompt, output_prompt
from pyba.utils.structure import PlaywrightResponse


[docs] class PlaywrightAgent(BaseAgent): """ Defines the playwright agent's actions Provides two endpoints: - `process_action`: for returning the right action on a page - `get_output`: for summarizing the chat and returning a string """ def __init__(self, engine) -> None: """ Args: `engine`: holds all the arguments from the user including the mode """ super().__init__(engine=engine) # Initialising the base params from BaseAgent self.action_agent, self.output_agent = self.llm_factory.get_agent() def _initialise_prompt( self, cleaned_dom: Dict[str, Union[List, str]], user_prompt: str, main_instruction: str, previous_action: str = None, fail_reason: str = None, action_status: bool = None, ): """ Method to initailise the main instruction for any agent Args: `cleaned_dom`: A dictionary containing nicely formatted DOM elements `user_prompt`: The instructions given by the user `main_instruction`: The prompt for the playwright agent `previous_action`: The previous action `fail_reason`: The reason for the failure of the previous action `action_status`: Boolean to decide if the previous action was a success or not TODO: Add `history` of ALL/SOME actions to give some context as to where we are headed # DEPRECATED - The fail_reason decides if the previous access was a success or not. For each run, a prompt containing the previous action, its status (success or failure) and a fail reason (if it failed) is provided. This helps the model reason better """ # Adding the user_prompt to the DOM to make it easier to format the prompt cleaned_dom["user_prompt"] = user_prompt cleaned_dom["previous_action"] = previous_action cleaned_dom["action_status"] = action_status cleaned_dom["fail_reason"] = fail_reason prompt = main_instruction.format(**cleaned_dom) return prompt def _call_model( self, agent: Any, prompt: str, agent_type: str, cleaned_dom: Dict = None, context_id: str = None, extractor=None, user_prompt: str = None, ) -> Any: """ Generic method to call the correct LLM provider and parse the response. Args: `agent`: The agent to use (action_agent or output_agent) `prompt`: The fully formatted prompt string `agent_type`: "action" or "output", to determine parsing logic `cleaned_dom`: A dictionary that holds the `actual_text` from which the data is to be extracted `context_id`: A unique identifier for this browser window (useful when multiple windows) `extractor`: The extraction agent for this call (passed in to avoid shared mutable state) `user_prompt`: The original user prompt for this call (passed in to avoid shared mutable state) Returns: The parsed response (SimpleNamespace for action, str for output) """ # If this guy gives me an output which says I need to extract the relevant data from this page, # Then I call the extraction agent here and extract information in a separate thread? Separate thread is easier, # I don't have to write my functions as async then if self.engine.provider == "openai": response = self.handle_openai_execution( agent=agent, prompt=prompt, context_id=context_id ) parsed_json = json.loads(response.choices[0].message.content) # Parse based on agent type if agent_type == "action": actions = SimpleNamespace(**parsed_json.get("actions")[0]) extract_info_flag = parsed_json.get("extract_info") if extract_info_flag: extractor.run_threaded_info_extraction( task=user_prompt, actual_text=cleaned_dom["actual_text"] ) return actions elif agent_type == "output": return str(parsed_json.get("output")) elif self.engine.provider == "vertexai": # VertexAI logic response = self.handle_vertexai_execution( agent=agent, prompt=prompt, context_id=context_id ) try: parsed_object = getattr( response, "output_parsed", getattr(response, "parsed", None) ) if not parsed_object: self.log.error("No parsed object found in VertexAI response.") return None # Parse based on agent type if agent_type == "action": if hasattr(parsed_object, "actions") and parsed_object.actions: actions = parsed_object.actions[0] extract_info_flag = parsed_object.extract_info if extract_info_flag: extractor.run_threaded_info_extraction( task=user_prompt, actual_text=cleaned_dom["actual_text"] ) return actions raise IndexError("No 'actions' found in VertexAI response.") elif agent_type == "output": if hasattr(parsed_object, "output") and parsed_object.output: return str(parsed_object.output) raise IndexError("No 'output' found in VertexAI response.") except Exception as e: if not response: self.log.error(f"Unable to parse the output from VertexAI response: {e}") # If we have a response which cannot be parsed, it MUST be a None value else: # Using gemini response = self.handle_gemini_execution( agent=agent, prompt=prompt, context_id=context_id ) parsed_object = agent["response_format"].model_validate_json(response.text) if agent_type == "action": if parsed_object.actions: actions = parsed_object.actions[0] extract_info_flag = parsed_object.extract_info if extract_info_flag: extractor.run_threaded_info_extraction( task=user_prompt, actual_text=cleaned_dom["actual_text"] ) return actions elif agent_type == "output": return str(parsed_object.output)
[docs] def process_action( self, cleaned_dom: Dict[str, Union[List, str]], user_prompt: str, previous_action: str = None, fail_reason: str = None, extraction_format: BaseModel = None, context_id: str = None, action_status: bool = None, ) -> PlaywrightResponse: """ Method to process the DOM and provide an actionable playwright response Args: `cleaned_dom`: Dictionary of the extracted items from the DOM - `hyperlinks`: List - `input_fields` (basically all fillable boxes): List - `clickable_fields`: List - `actual_text`: string `user_prompt`: The instructions given by the user `previous_action`: The previous executed action `fail_reason`: Holds the fail-reason should the previous task fail `extraction_format`: The extraction format for the task `context_id`: A unique identifier for this browser window (useful when multiple windows) `fail_reason`: The reason for failure of the previous action (None if not provided => Action passed) `action_status`: The success or the failure of an action output: A predefined pydantic model called `PlaywrightResponse` which defines our DSL """ prompt = self._initialise_prompt( cleaned_dom=cleaned_dom, user_prompt=user_prompt, main_instruction=general_prompt, previous_action=previous_action if previous_action else "", fail_reason=fail_reason if fail_reason else "", action_status=action_status if action_status else "", ) extractor = ExtractionAgent(engine=self.engine, extraction_format=extraction_format) return self._call_model( agent=self.action_agent, prompt=prompt, agent_type="action", cleaned_dom=cleaned_dom, context_id=context_id, extractor=extractor, user_prompt=user_prompt, )
[docs] def get_output( self, cleaned_dom: Dict[str, Union[List, str]], user_prompt: str, context_id: str = None ) -> str: """ Method to get the final output from the model if the user requested for one """ prompt = self._initialise_prompt( cleaned_dom=cleaned_dom, user_prompt=user_prompt, main_instruction=output_prompt ) return self._call_model( agent=self.output_agent, prompt=prompt, agent_type="output", context_id=context_id )