import json
from types import SimpleNamespace
from typing import Dict, List, Union, Any
from pydantic import BaseModel
from pyba.core.agent.base_agent import BaseAgent
from pyba.core.agent.extraction_agent import ExtractionAgent
from pyba.utils.exceptions import LLMResponseParseError
from pyba.utils.prompts import general_prompt, output_prompt
from pyba.utils.structure import PlaywrightResponse
[docs]
class PlaywrightAgent(BaseAgent):
    """
    Defines the playwright agent's actions

    Provides two endpoints:
    - process_action: for returning the right action on a page
    - get_output: for summarizing the chat and returning a string
    """

    def __init__(self, engine) -> None:
        """
        Args:
            engine: holds all the arguments from the user including the mode
        """
        super().__init__(engine=engine)  # Initialising the base params from BaseAgent
        # `llm_factory` is not set in this class; presumably BaseAgent.__init__
        # provides it -- verify against BaseAgent.
        self.action_agent, self.output_agent = self.llm_factory.get_agent()

    def _initialise_prompt(
        self,
        cleaned_dom: Dict[str, Union[List, str]],
        user_prompt: str,
        main_instruction: str,
        action_history: str = None,
        fail_reason: str = None,
        action_status: bool = None,
    ) -> str:
        """
        Formats the prompt template by injecting DOM data and action context.

        The caller's `cleaned_dom` is NOT modified: the context values are merged
        into a fresh dictionary that is only used for formatting.

        Args:
            cleaned_dom: Dictionary of extracted DOM elements.
            user_prompt: The user's task instruction.
            main_instruction: The prompt template to format.
            action_history: The full natural language history of actions taken so far.
            fail_reason: Reason the previous action failed, if applicable.
            action_status: Whether the previous action succeeded.

        Returns:
            `main_instruction` with its placeholders filled from `cleaned_dom`
            plus the four context values.
        """
        # Context keys come last so they win over same-named keys in cleaned_dom,
        # matching the precedence of the previous in-place assignment.
        prompt_fields = {
            **cleaned_dom,
            "user_prompt": user_prompt,
            "action_history": action_history,
            "action_status": action_status,
            "fail_reason": fail_reason,
        }
        return main_instruction.format(**prompt_fields)

    @staticmethod
    def _start_extraction(extractor, user_prompt: str, cleaned_dom: Dict) -> None:
        """
        Hands the page's `actual_text` to the extraction agent.

        Args:
            extractor: Object exposing `run_threaded_info_extraction`.
            user_prompt: The original user task, forwarded as `task`.
            cleaned_dom: Dictionary that must contain the `actual_text` key.
        """
        extractor.run_threaded_info_extraction(
            task=user_prompt, actual_text=cleaned_dom["actual_text"]
        )

    def _call_model(
        self,
        agent: Any,
        prompt: str,
        agent_type: str,
        cleaned_dom: Dict = None,
        context_id: str = None,
        extractor=None,
        user_prompt: str = None,
    ) -> Any:
        """
        Generic method to call the correct LLM provider and parse the response.

        The provider is chosen from `self.engine.provider`: "openai", "vertexai",
        and anything else is handled through the Gemini path.

        Args:
            agent: The agent to use (action_agent or output_agent)
            prompt: The fully formatted prompt string
            agent_type: "action" or "output", to determine parsing logic
            cleaned_dom: A dictionary that holds the `actual_text` from which the data is to be extracted
            context_id: A unique identifier for this browser window (useful when multiple windows)
            extractor: The extraction agent for this call (passed in to avoid shared mutable state)
            user_prompt: The original user prompt for this call (passed in to avoid shared mutable state)

        Returns:
            For "action": the first action of the response (a SimpleNamespace for
            OpenAI, the first element of the parsed `actions` for VertexAI/Gemini).
            For "output": the output as a string.
            None when `agent_type` is neither "action" nor "output".

        Raises:
            LLMResponseParseError: when the response cannot be parsed or an
                "action" response carries no actions.
        """
        provider = self.engine.provider

        if provider == "openai":
            response = self.handle_openai_execution(
                agent=agent, prompt=prompt, context_id=context_id
            )
            try:
                parsed_json = json.loads(response.choices[0].message.content)
            except (json.JSONDecodeError, TypeError, IndexError, AttributeError) as e:
                # TypeError covers a null `content`/`choices`: json.loads(None)
                # and None[0] both raise TypeError rather than a decode error.
                raise LLMResponseParseError(
                    "OpenAI returned a response that could not be parsed as JSON. "
                    "The model may have produced malformed output.",
                    cause=e,
                ) from e

            if agent_type == "action":
                actions_list = parsed_json.get("actions")
                if not actions_list:
                    raise LLMResponseParseError(
                        "OpenAI response contained no 'actions' field. "
                        "The model did not produce a valid next action.",
                    )
                # Only the first proposed action is used.
                actions = SimpleNamespace(**actions_list[0])
                if parsed_json.get("extract_info"):
                    self._start_extraction(extractor, user_prompt, cleaned_dom)
                return actions
            if agent_type == "output":
                return str(parsed_json.get("output"))
            return None

        if provider == "vertexai":
            response = self.handle_vertexai_execution(
                agent=agent, prompt=prompt, context_id=context_id
            )
            try:
                # Prefer `output_parsed`, falling back to `parsed`.
                parsed_object = getattr(
                    response, "output_parsed", getattr(response, "parsed", None)
                )
                if not parsed_object:
                    raise LLMResponseParseError(
                        "VertexAI returned a response with no parsed object. "
                        "The model may have returned an empty or malformed response.",
                    )

                if agent_type == "action":
                    if hasattr(parsed_object, "actions") and parsed_object.actions:
                        actions = parsed_object.actions[0]
                        if parsed_object.extract_info:
                            self._start_extraction(extractor, user_prompt, cleaned_dom)
                        return actions
                    raise LLMResponseParseError(
                        "VertexAI response contained no 'actions'. "
                        "The model did not produce a valid next action.",
                    )
                elif agent_type == "output":
                    if hasattr(parsed_object, "output") and parsed_object.output:
                        return str(parsed_object.output)
                    raise LLMResponseParseError(
                        "VertexAI response contained no 'output'. "
                        "The model did not produce a final summary.",
                    )
            except LLMResponseParseError:
                # Already the right error type; do not re-wrap it below.
                raise
            except Exception as e:
                raise LLMResponseParseError(
                    f"Failed to parse VertexAI response: {type(e).__name__}: {e}",
                    cause=e,
                ) from e
            return None

        # Every other provider value goes through the Gemini path.
        response = self.handle_gemini_execution(
            agent=agent, prompt=prompt, context_id=context_id
        )
        try:
            parsed_object = agent["response_format"].model_validate_json(response.text)
        except Exception as e:
            raise LLMResponseParseError(
                "Gemini returned a response that could not be parsed. "
                "The model may have produced malformed JSON output.",
                cause=e,
            ) from e

        if agent_type == "action":
            if parsed_object.actions:
                actions = parsed_object.actions[0]
                if parsed_object.extract_info:
                    self._start_extraction(extractor, user_prompt, cleaned_dom)
                return actions
            raise LLMResponseParseError(
                "Gemini response contained no 'actions'. "
                "The model did not produce a valid next action.",
            )
        if agent_type == "output":
            return str(parsed_object.output)
        return None

    def process_action(
        self,
        cleaned_dom: Dict[str, Union[List, str]],
        user_prompt: str,
        action_history: str = None,
        fail_reason: str = None,
        extraction_format: BaseModel = None,
        context_id: str = None,
        action_status: bool = None,
    ) -> PlaywrightResponse:
        """
        Processes the current DOM and returns the next PlaywrightAction to execute.

        Args:
            cleaned_dom: Dictionary of extracted DOM elements (hyperlinks, input_fields, clickable_fields, actual_text).
            user_prompt: The user's task instruction.
            action_history: The full natural language history of actions taken so far.
            fail_reason: Reason the previous action failed, if applicable.
            extraction_format: Pydantic model defining the extraction output schema.
            context_id: Unique identifier for this browser window (used in BFS mode).
            action_status: Whether the previous action succeeded.

        Returns:
            The first action proposed by the model for this page.

        Raises:
            LLMResponseParseError: if the model response cannot be parsed or
                contains no actions.
        """
        prompt = self._initialise_prompt(
            cleaned_dom=cleaned_dom,
            user_prompt=user_prompt,
            main_instruction=general_prompt[self.engine.provider],
            action_history=action_history or "",
            fail_reason=fail_reason or "",
            # Only a missing status is blanked out; an explicit False ("the
            # previous action failed") must reach the prompt unchanged.
            action_status="" if action_status is None else action_status,
        )
        # A fresh extractor per call, so calls do not share extractor state.
        extractor = ExtractionAgent(engine=self.engine, extraction_format=extraction_format)
        return self._call_model(
            agent=self.action_agent,
            prompt=prompt,
            agent_type="action",
            cleaned_dom=cleaned_dom,
            context_id=context_id,
            extractor=extractor,
            user_prompt=user_prompt,
        )

    def get_output(
        self, cleaned_dom: Dict[str, Union[List, str]], user_prompt: str, context_id: str = None
    ) -> str:
        """
        Gets the final text output from the model based on the current page state.

        Args:
            cleaned_dom: Dictionary of extracted DOM elements used to fill the prompt.
            user_prompt: The user's task instruction.
            context_id: Unique identifier for this browser window.

        Returns:
            The model's output as a string.

        Raises:
            LLMResponseParseError: if the model response cannot be parsed.
        """
        prompt = self._initialise_prompt(
            cleaned_dom=cleaned_dom, user_prompt=user_prompt, main_instruction=output_prompt
        )
        return self._call_model(
            agent=self.output_agent, prompt=prompt, agent_type="output", context_id=context_id
        )