Spaces:

smolagents
/

computer-use-agent

Running

App Files Community

Amir Mahla committed on Oct 17

Commit

c9554cf

1 Parent(s): af1ae43

MOCK backend

Browse files

Files changed (25) hide show

cua2-core/pyproject.toml +1 -3
cua2-core/src/cua2-core/models/models.py +0 -95
cua2-core/src/cua2-core/services/agent_service.py +0 -130
cua2-core/src/cua2-core/services/agents/get_agents.py +0 -57
cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py +0 -293
cua2-core/src/cua2-core/services/agents/normalized_agent.py +0 -282
cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py +0 -317
cua2-core/src/cua2-core/services/agents/prompt.py +0 -548
cua2-core/src/cua2-core/services/models/anthropic.py +0 -10
cua2-core/src/cua2-core/services/models/gemini.py +0 -0
cua2-core/src/cua2-core/services/models/get_model.py +0 -12
cua2-core/src/cua2-core/services/models/qwen.py +0 -0
cua2-core/src/{cua2-core → cua2_core}/__init__.py +0 -0
cua2-core/src/{cua2-core → cua2_core}/app.py +2 -2
cua2-core/src/{cua2-core → cua2_core}/main.py +7 -7
cua2-core/src/cua2_core/models/__init__.py +2 -0
cua2-core/src/cua2_core/models/models.py +221 -0
cua2-core/src/cua2_core/routes/__init__.py +2 -0
cua2-core/src/{cua2-core → cua2_core}/routes/routes.py +3 -3
cua2-core/src/{cua2-core → cua2_core}/routes/websocket.py +35 -30
cua2-core/src/cua2_core/services/__init__.py +2 -0
cua2-core/src/cua2_core/services/agent_service.py +172 -0
cua2-core/src/cua2_core/services/simulation_metadata/simulated_trace.json +62 -0
cua2-core/src/cua2_core/websocket/__init__.py +2 -0
cua2-core/src/{cua2-core → cua2_core}/websocket/websocket_manager.py +4 -4

cua2-core/pyproject.toml CHANGED Viewed

@@ -6,7 +6,6 @@ build-backend = "hatchling.build"
 name = "cua2-core"
 version = "0.0.0-dev.0"
 description = "Backend API server for Computer Use Agent"
-readme = "README.md"
 authors = [{ name = "Amir Mahla", email = "[email protected]" }]
 keywords = ["fastapi", "api", "backend", "automation"]
 classifiers = [
@@ -61,12 +60,11 @@ Homepage = "https://github.com/huggingface/CUA2"
 Repository = "https://github.com/huggingface/CUA2"
 [tool.hatch.build.targets.wheel]
-packages = ["src/cua2-core"]
 [tool.hatch.build.targets.sdist]
 include = [
     "/src",
-    "/README.md",
 ]
 [tool.coverage.run]

 name = "cua2-core"
 version = "0.0.0-dev.0"
 description = "Backend API server for Computer Use Agent"
 authors = [{ name = "Amir Mahla", email = "[email protected]" }]
 keywords = ["fastapi", "api", "backend", "automation"]
 classifiers = [
 Repository = "https://github.com/huggingface/CUA2"
 [tool.hatch.build.targets.wheel]
+packages = ["src/cua2_core"]
 [tool.hatch.build.targets.sdist]
 include = [
     "/src",
 ]
 [tool.coverage.run]

cua2-core/src/cua2-core/models/models.py DELETED Viewed

@@ -1,95 +0,0 @@
-import json
-import os
-from datetime import datetime
-from enum import Enum
-from typing import Literal, Optional
-from pydantic import BaseModel, model_validator
-class AgentMetadata(BaseModel):
-    """Metadata for agent execution"""
-    inputTokensUsed: int
-    outputTokensUsed: int
-    timeTaken: float  # in seconds
-    numberOfSteps: int
-class AgentType(str, Enum):
-    """Agent type"""
-    PIXEL_COORDINATES = "pixel_coordinates"
-    NORMALIZED_1000_COORDINATES = "normalized_1000_coordinates"
-    NORMALIZED_COORDINATES = "normalized_coordinates"
-class ActiveTask(BaseModel):
-    """Active task"""
-    message_id: str
-    content: str
-    model_id: str
-    start_time: datetime
-    status: str
-    @property
-    def trace_path(self):
-        """Trace path"""
-        return f"data/trace-{self.message_id}-{self.model_id}"
-    @model_validator(mode="after")
-    def validate_model_id(self):
-        """Validate model ID"""
-        os.makedirs(self.trace_path, exist_ok=True)
-        with open(f"{self.trace_path}/user_tasks.json", "w") as f:
-            json.dump(self.model_dump(mode="json"), f, indent=2)
-        return self
-class WebSocketEvent(BaseModel):
-    """WebSocket event structure"""
-    type: Literal[
-        "agent_start",
-        "agent_progress",
-        "agent_complete",
-        "agent_error",
-        "vnc_url_set",
-        "vnc_url_unset",
-        "heartbeat",
-    ]
-    content: Optional[str] = None
-    metadata: Optional[AgentMetadata] = None
-    messageId: Optional[str] = None
-    vncUrl: Optional[str] = None
-class UserTaskMessage(BaseModel):
-    """Message sent from frontend to backend"""
-    type: Literal["user_task"]
-    content: str
-    model_id: str
-    timestamp: str
-class AgentMessage(BaseModel):
-    """Agent message structure"""
-    id: str
-    type: Literal["user", "agent"]
-    content: str
-    timestamp: datetime
-    metadata: Optional[AgentMetadata] = None
-    isLoading: Optional[bool] = None
-    truncated: Optional[bool] = None
-class HealthResponse(BaseModel):
-    """Health check response"""
-    status: str
-    timestamp: datetime
-    websocket_connections: int

cua2-core/src/cua2-core/services/agent_service.py DELETED Viewed

@@ -1,130 +0,0 @@
-import asyncio
-import uuid
-from datetime import datetime
-from typing import Optional
-from smolagents import Model
-from backend.models.models import ActiveTask, AgentMetadata
-from backend.services.agents.get_agents import get_agent
-from backend.services.models.get_model import get_model
-from backend.websocket.websocket_manager import WebSocketManager
-from computer_use_studio import Sandbox
-from computer_use_studio.logger import get_logger
-logger = get_logger(__name__)
-class AgentService:
-    """Service for handling agent tasks and processing"""
-    def __init__(self, websocket_manager):
-        self.active_tasks: dict[str, ActiveTask] = {}
-        self.websocket_manager: WebSocketManager = websocket_manager
-    async def process_user_task(self, content: str, model_id: str) -> str:
-        """Process a user task and return the message ID"""
-        message_id = str(uuid.uuid4())
-        while message_id in self.active_tasks.keys():
-            message_id = str(uuid.uuid4())
-        # Store the task
-        self.active_tasks[message_id] = ActiveTask(
-            message_id=message_id,
-            content=content,
-            model_id=model_id,
-            start_time=datetime.now(),
-            status="processing",
-        )
-        # Determine the agent type based on the content of the task (TODO: implement agent type detection using LLM)
-        prompt_type = "FORM_SYSTEM_PROMPT"
-        # Start the agent processing in the background
-        asyncio.create_task(
-            self._simulate_agent_processing(content, model_id, message_id, prompt_type)
-        )
-        return message_id
-    #     async def _simulate_agent_processing(self, message_id: str, content: str):
-    #         """Simulate agent processing with progress updates"""
-    #         try:
-    #             # Send agent start event
-    #             await self.websocket_manager.send_agent_start(
-    #                 content=f"Starting task: {content}", message_id=message_id
-    #             )
-    #
-    #             # Simulate processing steps
-    #             steps = [
-    #                 "Analyzing task requirements...",
-    #                 "Planning execution steps...",
-    #                 "Initializing computer interface...",
-    #                 "Executing task commands...",
-    #                 "Verifying results...",
-    #                 "Finalizing task completion...",
-    #             ]
-    #
-    #             for i, step in enumerate(steps):
-    #                 await asyncio.sleep(2)  # Simulate processing time
-    #
-    #                 # Send progress update
-    #                 await self.websocket_manager.send_agent_progress(
-    #                     content=f"{step} ({i + 1}/{len(steps)})", message_id=message_id
-    #                 )
-    #
-    #                 # Simulate VNC URL events during processing
-    #                 if i == 2:  # After "Initializing computer interface..."
-    #                     # Set VNC URL when computer interface is ready
-    #                     vnc_url = "http://localhost:6080/vnc.html?host=localhost&port=5900&autoconnect=true"
-    #                     await self.websocket_manager.send_vnc_url_set(
-    #                         vnc_url=vnc_url,
-    #                         content="Computer interface ready, VNC stream connected",
-    #                     )
-    #                 elif i == 4:  # After "Verifying results..."
-    #                     # Unset VNC URL when task is almost complete
-    #                     await self.websocket_manager.send_vnc_url_unset(
-    #                         content="Task verification complete, disconnecting VNC stream"
-    #                     )
-    #
-    #             # Calculate metadata
-    #             end_time = datetime.now()
-    #             start_time = self.active_tasks[message_id]["start_time"]
-    #             time_taken = (end_time - start_time).total_seconds()
-    #
-    #             metadata = AgentMetadata(
-    #                 tokensUsed=150 + len(content) * 2,  # Simulate token usage
-    #                 timeTaken=time_taken,
-    #                 numberOfSteps=len(steps),
-    #             )
-    #
-    #             # Send completion event
-    #             await self.websocket_manager.send_agent_complete(
-    #                 content=f"Task completed successfully: {content}",
-    #                 message_id=message_id,
-    #                 metadata=metadata,
-    #             )
-    #
-    #             # Clean up
-    #             if message_id in self.active_tasks:
-    #                 del self.active_tasks[message_id]
-    #
-    #         except Exception as e:
-    #             # Send error event
-    #             await self.websocket_manager.send_agent_error(
-    #                 content=f"Error processing task: {str(e)}", message_id=message_id
-    #             )
-    #
-    #             # Clean up
-    #             if message_id in self.active_tasks:
-    #                 del self.active_tasks[message_id]
-    def get_active_tasks(self) -> dict:
-        """Get currently active tasks"""
-        return self.active_tasks.copy()
-    def get_task_status(self, message_id: str) -> Optional[dict]:
-        """Get status of a specific task"""
-        return self.active_tasks.get(message_id)

cua2-core/src/cua2-core/services/agents/get_agents.py DELETED Viewed

@@ -1,57 +0,0 @@
-from typing import Annotated, TypeAlias
-from pydantic import Field
-from smolagents import Model
-from backend.models.models import AgentType
-from backend.services.agents.normalized_1000_agent import Normalized1000Agent
-from backend.services.agents.normalized_agent import NormalizedAgent
-from backend.services.agents.pixel_coordonates_agent import PixelCoordinatesAgent
-from backend.services.agents.prompt import (
-    Normalized1000CoordinatesSystemPrompt,
-    NormalizedCoordinatesSystemPrompt,
-    PixelCoordinatesSystemPrompt,
-)
-from computer_use_studio import Sandbox
-Agent: TypeAlias = Annotated[
-    PixelCoordinatesAgent | Normalized1000Agent | NormalizedAgent,
-    Field(discriminator="AGENT_TYPE"),
-]
-def get_agent(
-    model: Model,
-    desktop: Sandbox,
-    agent_type: AgentType,
-    prompt_type: str,
-    data_dir: str,
-    **kwargs,
-) -> Agent:
-    """Get the agent by type"""
-    if agent_type == AgentType.PIXEL_COORDINATES:
-        return PixelCoordinatesAgent(
-            model=model,
-            desktop=desktop,
-            system_prompt=PixelCoordinatesSystemPrompt[prompt_type].value,
-            data_dir=data_dir,
-            **kwargs,
-        )
-    elif agent_type == AgentType.NORMALIZED_1000_COORDINATES:
-        return Normalized1000Agent(
-            model=model,
-            desktop=desktop,
-            system_prompt=Normalized1000CoordinatesSystemPrompt[prompt_type].value,
-            data_dir=data_dir,
-            **kwargs,
-        )
-    elif agent_type == AgentType.NORMALIZED_COORDINATES:
-        return Normalized1000Agent(
-            model=model,
-            desktop=desktop,
-            system_prompt=NormalizedCoordinatesSystemPrompt[prompt_type].value,
-            data_dir=data_dir,
-            **kwargs,
-        )
-    else:
-        raise ValueError(f"Invalid agent type: {agent_type}")

cua2-core/src/cua2-core/services/agents/normalized_1000_agent.py DELETED Viewed

@@ -1,293 +0,0 @@
-import time
-import unicodedata
-from typing import List, Literal
-# SmolaAgents imports
-from smolagents import Model, Tool, tool
-from smolagents.monitoring import LogLevel
-from backend.models.models import AgentType
-from backend.services.agents.prompt import Normalized1000CoordinatesSystemPrompt
-from computer_use_studio import DesktopAgentBase, Sandbox
-class Normalized1000Agent(DesktopAgentBase):
-    """Agent for desktop automation with normalized coordinates (0 to 1000)"""
-    AGENT_TYPE = AgentType.NORMALIZED_1000_COORDINATES
-    def __init__(
-        self,
-        model: Model,
-        data_dir: str,
-        desktop: Sandbox,
-        system_prompt: Normalized1000CoordinatesSystemPrompt,
-        tools: List[Tool] | None = None,
-        max_steps: int = 20,
-        verbosity_level: LogLevel = LogLevel.INFO,
-        planning_interval: int | None = None,
-        use_v1_prompt: bool = False,
-        **kwargs,
-    ):
-        super().__init__(
-            model=model,
-            data_dir=data_dir,
-            desktop=desktop,
-            system_prompt=system_prompt,
-            tools=tools,
-            max_steps=max_steps,
-            verbosity_level=verbosity_level,
-            planning_interval=planning_interval,
-            use_v1_prompt=use_v1_prompt,
-            **kwargs,
-        )
-    def _normalize_to_pixel(self, norm_x: int, norm_y: int) -> tuple[int, int]:
-        """
-        Convert normalized coordinates (0-1000) to pixel coordinates
-        Args:
-            norm_x: Normalized x coordinate (0 to 1000)
-            norm_y: Normalized y coordinate (0 to 1000)
-        Returns:
-            Tuple of (pixel_x, pixel_y)
-        """
-        # Clamp values to valid range
-        norm_x = max(0, min(1000, norm_x))
-        norm_y = max(0, min(1000, norm_y))
-        # Convert from 0-1000 range to 0-1 range, then to pixels
-        norm_x_float = norm_x / 1000.0
-        norm_y_float = norm_y / 1000.0
-        pixel_x = int(norm_x_float * self.width)
-        pixel_y = int(norm_y_float * self.height)
-        # Ensure we don't go outside screen bounds
-        pixel_x = max(0, min(self.width - 1, pixel_x))
-        pixel_y = max(0, min(self.height - 1, pixel_y))
-        return pixel_x, pixel_y
-    def _setup_desktop_tools(self):
-        """Register all desktop tools with normalized coordinate support (0-1000)"""
-        @tool
-        def click(x: int, y: int) -> str:
-            """
-            Performs a left-click at the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
-                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.left_click(pixel_x, pixel_y)
-            self.click_coordinates = (pixel_x, pixel_y)
-            self.logger.log(
-                f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-            )
-            time.sleep(1)
-            return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-        @tool
-        def right_click(x: int, y: int) -> str:
-            """
-            Performs a right-click at the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
-                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.right_click(pixel_x, pixel_y)
-            self.click_coordinates = (pixel_x, pixel_y)
-            self.logger.log(
-                f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-            )
-            return f"Right-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-        @tool
-        def double_click(x: int, y: int) -> str:
-            """
-            Performs a double-click at the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
-                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.double_click(pixel_x, pixel_y)
-            self.click_coordinates = (pixel_x, pixel_y)
-            self.logger.log(
-                f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-            )
-            return f"Double-clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-        @tool
-        def move_mouse(x: int, y: int) -> str:
-            """
-            Moves the mouse cursor to the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0 to 1000, where 0 is left edge, 1000 is right edge)
-                y: The normalized y coordinate (0 to 1000, where 0 is top edge, 1000 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.move_mouse(pixel_x, pixel_y)
-            self.logger.log(
-                f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-            )
-            return f"Moved mouse to normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-        def normalize_text(text):
-            return "".join(
-                c
-                for c in unicodedata.normalize("NFD", text)
-                if not unicodedata.combining(c)
-            )
-        @tool
-        def write(text: str) -> str:
-            """
-            Types the specified text at the current cursor position.
-            Args:
-                text: The text to type
-            """
-            # clean_text = normalize_text(text)
-            self.desktop.write(text, delay_in_ms=10)
-            self.logger.log(f"Typed text: '{text}'")
-            time.sleep(1)
-            return f"Typed text: '{text}'"
-        @tool
-        def press(key: str) -> str:
-            """
-            Presses a keyboard key or combination of keys
-            Args:
-                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
-            """
-            self.desktop.press(key)
-            self.logger.log(f"Pressed key: {key}")
-            time.sleep(0.1)
-            return f"Pressed key: {key}"
-        @tool
-        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
-            """
-            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
-            Args:
-                x1: origin normalized x coordinate (0 to 1000)
-                y1: origin normalized y coordinate (0 to 1000)
-                x2: end normalized x coordinate (0 to 1000)
-                y2: end normalized y coordinate (0 to 1000)
-            """
-            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
-            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
-            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
-            message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}] -> pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}]"
-            self.logger.log(message)
-            return message
-        @tool
-        def scroll(
-            x: int,
-            y: int,
-            direction: Literal["up", "down"] = "down",
-            amount: int = 2,
-        ) -> str:
-            """
-            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
-            Args:
-                x: The normalized x coordinate (0 to 1000) of the element to scroll/zoom
-                y: The normalized y coordinate (0 to 1000) of the element to scroll/zoom
-                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
-                amount: The amount to scroll. A good amount is 1 or 2.
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.move_mouse(pixel_x, pixel_y)
-            self.desktop.scroll(direction=direction, amount=amount)
-            message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-            self.logger.log(message)
-            return message
-        @tool
-        def wait(seconds: float) -> str:
-            """
-            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
-            Args:
-                seconds: Number of seconds to wait, generally 3 is enough.
-            """
-            time.sleep(seconds)
-            self.logger.log(f"Waited for {seconds} seconds")
-            return f"Waited for {seconds} seconds"
-        @tool
-        def open(file_or_url: str) -> str:
-            """
-            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
-            Args:
-                file_or_url: The URL or file to open
-            """
-            self.desktop.open(file_or_url)
-            # Give it time to load
-            time.sleep(2)
-            self.logger.log(f"Opening: {file_or_url}")
-            return f"Opened: {file_or_url}"
-        @tool
-        def launch_app(app_name: str) -> str:
-            """
-            Launches the specified application.
-            Args:
-                app_name: the name of the application to launch
-            """
-            self.desktop.launch(app_name)
-            self.logger.log(f"Launched app: {app_name}")
-            return f"Launched app: {app_name}"
-        @tool
-        def execute(command: str) -> str:
-            """
-            Executes a terminal command in the desktop environment.
-            Args:
-                command: The command to execute
-            """
-            self.desktop.execute_command(command)
-            self.logger.log(f"Executed command: {command}")
-            return f"Executed command: {command}"
-        @tool
-        def refresh() -> str:
-            """
-            Refreshes the current web page if you're in a browser.
-            """
-            self.desktop.press(["ctrl", "r"])
-            self.logger.log("Refreshed the current page")
-            return "Refreshed the current page"
-        @tool
-        def go_back() -> str:
-            """
-            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
-            Args:
-            """
-            self.desktop.press(["alt", "left"])
-            self.logger.log("Went back one page")
-            return "Went back one page"
-        # Register the tools
-        self.tools["click"] = click
-        self.tools["right_click"] = right_click
-        self.tools["double_click"] = double_click
-        self.tools["move_mouse"] = move_mouse
-        self.tools["write"] = write
-        self.tools["press"] = press
-        self.tools["scroll"] = scroll
-        self.tools["wait"] = wait
-        self.tools["open"] = open
-        self.tools["go_back"] = go_back
-        self.tools["drag"] = drag
-        self.tools["launch_app"] = launch_app
-        self.tools["execute"] = execute
-        self.tools["refresh"] = refresh
-        self.tools["refresh"] = refresh
-        self.tools["execute"] = execute
-        self.tools["refresh"] = refresh
-        self.tools["refresh"] = refresh

cua2-core/src/cua2-core/services/agents/normalized_agent.py DELETED Viewed

@@ -1,282 +0,0 @@
-import time
-import unicodedata
-from typing import List, Literal
-# SmolaAgents imports
-from smolagents import Model, Tool, tool
-from smolagents.monitoring import LogLevel
-from backend.models.models import AgentType
-from backend.services.agents.prompt import NormalizedCoordinatesSystemPrompt
-from computer_use_studio import DesktopAgentBase, Sandbox
-class NormalizedAgent(DesktopAgentBase):
-    """Agent for desktop automation with normalized coordinates (0.0 to 1.0)"""
-    AGENT_TYPE = AgentType.NORMALIZED_COORDINATES
-    def __init__(
-        self,
-        model: Model,
-        data_dir: str,
-        desktop: Sandbox,
-        system_prompt: NormalizedCoordinatesSystemPrompt,
-        tools: List[Tool] | None = None,
-        max_steps: int = 20,
-        verbosity_level: LogLevel = LogLevel.INFO,
-        planning_interval: int | None = None,
-        use_v1_prompt: bool = False,
-        **kwargs,
-    ):
-        super().__init__(
-            model=model,
-            data_dir=data_dir,
-            desktop=desktop,
-            system_prompt=system_prompt,
-            tools=tools,
-            max_steps=max_steps,
-            verbosity_level=verbosity_level,
-            planning_interval=planning_interval,
-            use_v1_prompt=use_v1_prompt,
-            **kwargs,
-        )
-    def _normalize_to_pixel(self, norm_x: float, norm_y: float) -> tuple[int, int]:
-        """
-        Convert normalized coordinates (0.0-1.0) to pixel coordinates
-        Args:
-            norm_x: Normalized x coordinate (0.0 to 1.0)
-            norm_y: Normalized y coordinate (0.0 to 1.0)
-        Returns:
-            Tuple of (pixel_x, pixel_y)
-        """
-        # Clamp values to valid range
-        norm_x = max(0.0, min(1.0, norm_x))
-        norm_y = max(0.0, min(1.0, norm_y))
-        pixel_x = int(norm_x * self.width)
-        pixel_y = int(norm_y * self.height)
-        # Ensure we don't go outside screen bounds
-        pixel_x = max(0, min(self.width - 1, pixel_x))
-        pixel_y = max(0, min(self.height - 1, pixel_y))
-        return pixel_x, pixel_y
-    def _setup_desktop_tools(self):
-        """Register all desktop tools with normalized coordinate support"""
-        @tool
-        def click(x: float, y: float) -> str:
-            """
-            Performs a left-click at the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
-                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.left_click(pixel_x, pixel_y)
-            self.click_coordinates = (pixel_x, pixel_y)
-            self.logger.log(
-                f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-            )
-            return f"Clicked at normalized coordinates ({x}, {y}) -> pixels ({pixel_x}, {pixel_y})"
-        @tool
-        def right_click(x: float, y: float) -> str:
-            """
-            Performs a right-click at the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
-                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.right_click(pixel_x, pixel_y)
-            self.click_coordinates = (pixel_x, pixel_y)
-            self.logger.log(
-                f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
-            )
-            return f"Right-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
-        @tool
-        def double_click(x: float, y: float) -> str:
-            """
-            Performs a double-click at the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
-                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.double_click(pixel_x, pixel_y)
-            self.click_coordinates = (pixel_x, pixel_y)
-            self.logger.log(
-                f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
-            )
-            return f"Double-clicked at normalized coordinates ({pixel_x}, {pixel_y})"
-        @tool
-        def move_mouse(x: float, y: float) -> str:
-            """
-            Moves the mouse cursor to the specified normalized coordinates
-            Args:
-                x: The normalized x coordinate (0.0 to 1.0, where 0.0 is left edge, 1.0 is right edge)
-                y: The normalized y coordinate (0.0 to 1.0, where 0.0 is top edge, 1.0 is bottom edge)
-            """
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.move_mouse(pixel_x, pixel_y)
-            self.logger.log(
-                f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
-            )
-            return f"Moved mouse to normalized coordinates ({pixel_x}, {pixel_y})"
-        def normalize_text(text):
-            return "".join(
-                c
-                for c in unicodedata.normalize("NFD", text)
-                if not unicodedata.combining(c)
-            )
-        @tool
-        def write(text: str) -> str:
-            """
-            Types the specified text at the current cursor position.
-            Args:
-                text: The text to type
-            """
-            # clean_text = normalize_text(text)
-            self.desktop.write(text, delay_in_ms=10)
-            self.logger.log(f"Typed text: '{text}'")
-            return f"Typed text: '{text}'"
-        @tool
-        def press(key: str) -> str:
-            """
-            Presses a keyboard key or combination of keys
-            Args:
-                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
-            """
-            self.desktop.press(key)
-            self.logger.log(f"Pressed key: {key}")
-            return f"Pressed key: {key}"
-        @tool
-        def drag(x1: float, y1: float, x2: float, y2: float) -> str:
-            """
-            Clicks at normalized coordinates [x1, y1], drags mouse to [x2, y2], then release click.
-            Args:
-                x1: origin normalized x coordinate (0.0 to 1.0)
-                y1: origin normalized y coordinate (0.0 to 1.0)
-                x2: end normalized x coordinate (0.0 to 1.0)
-                y2: end normalized y coordinate (0.0 to 1.0)
-            """
-            # Convert both endpoints before acting; the desktop is driven with the converted values.
-            pixel_x1, pixel_y1 = self._normalize_to_pixel(x1, y1)
-            pixel_x2, pixel_y2 = self._normalize_to_pixel(x2, y2)
-            self.desktop.drag((pixel_x1, pixel_y1), (pixel_x2, pixel_y2))
-            # Report the normalized values the caller passed in: the message says
-            # "normalized", so it must not contain the converted pixel values.
-            message = f"Dragged and dropped from normalized [{x1}, {y1}] to [{x2}, {y2}]"
-            # The converted values are still useful for debugging, so keep them in the log only.
-            self.logger.log(
-                f"{message} (pixels [{pixel_x1}, {pixel_y1}] to [{pixel_x2}, {pixel_y2}])"
-            )
-            return message
-        @tool
-        def scroll(
-            x: float,
-            y: float,
-            direction: Literal["up", "down"] = "down",
-            amount: int = 2,
-        ) -> str:
-            """
-            Moves the mouse to selected normalized coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
-            Args:
-                x: The normalized x coordinate (0.0 to 1.0) of the element to scroll/zoom
-                y: The normalized y coordinate (0.0 to 1.0) of the element to scroll/zoom
-                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
-                amount: The amount to scroll. A good amount is 1 or 2.
-            """
-            # Position the pointer over the target first, then scroll.
-            pixel_x, pixel_y = self._normalize_to_pixel(x, y)
-            self.desktop.move_mouse(pixel_x, pixel_y)
-            self.desktop.scroll(direction=direction, amount=amount)
-            # Report the normalized values the caller passed in: the message says
-            # "normalized", so it must not contain the converted pixel values.
-            message = f"Scrolled {direction} by {amount} at normalized coordinates ({x}, {y})"
-            # The converted values are kept in the log only, for debugging.
-            self.logger.log(f"{message} (pixels ({pixel_x}, {pixel_y}))")
-            return message
-        @tool
-        def wait(seconds: float) -> str:
-            """
-            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
-            Args:
-                seconds: Number of seconds to wait, generally 3 is enough.
-            """
-            # Blocks the calling thread for the whole duration.
-            time.sleep(seconds)
-            outcome = f"Waited for {seconds} seconds"
-            self.logger.log(outcome)
-            return outcome
-        @tool
-        def open(file_or_url: str) -> str:
-            """
-            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
-            Args:
-                file_or_url: The URL or file to open
-            """
-            # Fixed pause after the open request so the target has time to load.
-            load_pause_seconds = 2
-            self.desktop.open(file_or_url)
-            time.sleep(load_pause_seconds)
-            # The log line and the returned line are worded differently on purpose of
-            # preserving existing output ("Opening" vs "Opened").
-            self.logger.log(f"Opening: {file_or_url}")
-            return f"Opened: {file_or_url}"
-        @tool
-        def launch_app(app_name: str) -> str:
-            """
-            Launches the specified application.
-            Args:
-                app_name: the name of the application to launch
-            """
-            # The application name is passed to the desktop unchanged.
-            self.desktop.launch(app_name)
-            outcome = f"Launched app: {app_name}"
-            self.logger.log(outcome)
-            return outcome
-        @tool
-        def execute(command: str) -> str:
-            """
-            Executes a terminal command in the desktop environment.
-            Args:
-                command: The command to execute
-            """
-            # The command string is forwarded as-is; whatever execute_command returns
-            # is discarded and only a status line is reported back.
-            self.desktop.execute_command(command)
-            outcome = f"Executed command: {command}"
-            self.logger.log(outcome)
-            return outcome
-        @tool
-        def refresh() -> str:
-            """
-            Refreshes the current web page if you're in a browser.
-            """
-            # Sends the Ctrl+R key combination to the desktop.
-            reload_keys = ["ctrl", "r"]
-            self.desktop.press(reload_keys)
-            outcome = "Refreshed the current page"
-            self.logger.log(outcome)
-            return outcome
-        @tool
-        def go_back() -> str:
-            """
-            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
-            Args:
-            """
-            # Sends the Alt+Left key combination to the desktop.
-            back_keys = ["alt", "left"]
-            self.desktop.press(back_keys)
-            outcome = "Went back one page"
-            self.logger.log(outcome)
-            return outcome
-        # Register every tool defined above in self.tools, keyed by its name.
-        # The insertion order is kept identical to the previous explicit assignments.
-        for tool_name, tool_fn in (
-            ("click", click),
-            ("right_click", right_click),
-            ("double_click", double_click),
-            ("move_mouse", move_mouse),
-            ("write", write),
-            ("press", press),
-            ("scroll", scroll),
-            ("wait", wait),
-            ("open", open),
-            ("go_back", go_back),
-            ("drag", drag),
-            ("launch_app", launch_app),
-            ("execute", execute),
-            ("refresh", refresh),
-        ):
-            self.tools[tool_name] = tool_fn

cua2-core/src/cua2-core/services/agents/pixel_coordonates_agent.py DELETED Viewed

@@ -1,317 +0,0 @@
-import time
-import unicodedata
-from typing import List, Literal
-# SmolaAgents imports
-from smolagents import Model, Tool, tool
-from smolagents.monitoring import LogLevel
-from backend.models.models import AgentType
-from backend.services.agents.prompt import PixelCoordinatesSystemPrompt
-from computer_use_studio import DesktopAgentBase, Sandbox
-class PixelCoordinatesAgent(DesktopAgentBase):
-    """Desktop automation agent whose tools address the screen in absolute pixel coordinates.
-    Construction is delegated entirely to DesktopAgentBase; this subclass adds the
-    AGENT_TYPE tag and the tool set registered by _setup_desktop_tools.
-    """
-    AGENT_TYPE = AgentType.PIXEL_COORDINATES
-    def __init__(
-        self,
-        model: Model,
-        data_dir: str,
-        desktop: Sandbox,
-        system_prompt: PixelCoordinatesSystemPrompt,
-        tools: List[Tool] | None = None,
-        max_steps: int = 20,
-        verbosity_level: LogLevel = LogLevel.INFO,
-        planning_interval: int | None = None,
-        use_v1_prompt: bool = False,
-        **kwargs,
-    ):
-        """Forward every argument, unchanged, to DesktopAgentBase.__init__."""
-        super().__init__(
-            model=model,
-            data_dir=data_dir,
-            desktop=desktop,
-            system_prompt=system_prompt,
-            tools=tools,
-            max_steps=max_steps,
-            verbosity_level=verbosity_level,
-            planning_interval=planning_interval,
-            use_v1_prompt=use_v1_prompt,
-            **kwargs,
-        )
-        # OPTIONAL: Add a custom prompt template - see src/computer_use_studio/desktop_agent/desktop_agent_base.py for more details about the default prompt template
-        # self.prompt_templates["system_prompt"] = CUSTOM_PROMPT_TEMPLATE.replace(
-        #     "<<resolution_x>>", str(self.width)
-        # ).replace("<<resolution_y>>", str(self.height))
-        # Important: Change the prompt to get better results, depending on your action space.
-    def _setup_desktop_tools(self):
-        """Define the desktop tools as closures over ``self`` and register them in ``self.tools``.
-        NOTE(review): presumably invoked by DesktopAgentBase during construction - confirm.
-        NOTE(review): the docstrings of the @tool functions below appear to be consumed by
-        smolagents to build each tool's description and inputs, so their wording is part of
-        the agent's behavior - edit them deliberately.
-        """
-        @tool
-        def click(x: int, y: int) -> str:
-            """
-            Performs a left-click at the specified coordinates
-            Args:
-                x: The x coordinate (horizontal position)
-                y: The y coordinate (vertical position)
-            """
-            self.desktop.left_click(x, y)
-            # Remember where the last click landed.
-            self.click_coordinates = (x, y)
-            message = f"Clicked at coordinates ({x}, {y})"
-            self.logger.log(message)
-            return message
-        @tool
-        def right_click(x: int, y: int) -> str:
-            """
-            Performs a right-click at the specified coordinates
-            Args:
-                x: The x coordinate (horizontal position)
-                y: The y coordinate (vertical position)
-            """
-            self.desktop.right_click(x, y)
-            # Remember where the last click landed.
-            self.click_coordinates = (x, y)
-            message = f"Right-clicked at coordinates ({x}, {y})"
-            self.logger.log(message)
-            return message
-        @tool
-        def double_click(x: int, y: int) -> str:
-            """
-            Performs a double-click at the specified coordinates
-            Args:
-                x: The x coordinate (horizontal position)
-                y: The y coordinate (vertical position)
-            """
-            self.desktop.double_click(x, y)
-            # Remember where the last click landed.
-            self.click_coordinates = (x, y)
-            message = f"Double-clicked at coordinates ({x}, {y})"
-            self.logger.log(message)
-            return message
-        @tool
-        def move_mouse(x: int, y: int) -> str:
-            """
-            Moves the mouse cursor to the specified coordinates
-            Args:
-                x: The x coordinate (horizontal position)
-                y: The y coordinate (vertical position)
-            """
-            self.desktop.move_mouse(x, y)
-            message = f"Moved mouse to coordinates ({x}, {y})"
-            self.logger.log(message)
-            return message
-        @tool
-        def write(text: str) -> str:
-            """
-            Types the specified text at the current cursor position.
-            Args:
-                text: The text to type
-            """
-            # The text is sent verbatim: no accent stripping is applied.
-            self.desktop.write(text, delay_in_ms=10)
-            message = f"Typed text: '{text}'"
-            self.logger.log(message)
-            return message
-        @tool
-        def press(key: str) -> str:
-            """
-            Presses a keyboard key or combination of keys
-            Args:
-                key: The key to press (e.g. "enter", "space", "backspace", etc.) or a multiple keys string to press, for example "ctrl+a" or "ctrl+shift+a".
-            """
-            self.desktop.press(key)
-            message = f"Pressed key: {key}"
-            self.logger.log(message)
-            return message
-        @tool
-        def drag(x1: int, y1: int, x2: int, y2: int) -> str:
-            """
-            Clicks [x1, y1], drags mouse to [x2, y2], then release click.
-            Args:
-                x1: origin x coordinate
-                y1: origin y coordinate
-                x2: end x coordinate
-                y2: end y coordinate
-            """
-            self.desktop.drag((x1, y1), (x2, y2))
-            message = f"Dragged and dropped from [{x1}, {y1}] to [{x2}, {y2}]"
-            self.logger.log(message)
-            return message
-        @tool
-        def scroll(
-            x: int,
-            y: int,
-            direction: Literal["up", "down"] = "down",
-            amount: int = 2,
-        ) -> str:
-            """
-            Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
-            Args:
-                x: The x coordinate (horizontal position) of the element to scroll/zoom
-                y: The y coordinate (vertical position) of the element to scroll/zoom
-                direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
-                amount: The amount to scroll. A good amount is 1 or 2.
-            """
-            # Position the pointer over the target first, then scroll.
-            self.desktop.move_mouse(x, y)
-            self.desktop.scroll(direction=direction, amount=amount)
-            message = f"Scrolled {direction} by {amount}"
-            self.logger.log(message)
-            return message
-        @tool
-        def wait(seconds: float) -> str:
-            """
-            Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
-            Args:
-                seconds: Number of seconds to wait, generally 3 is enough.
-            """
-            time.sleep(seconds)
-            message = f"Waited for {seconds} seconds"
-            self.logger.log(message)
-            return message
-        @tool
-        def open(file_or_url: str) -> str:
-            """
-            Directly opens a browser with the specified url or opens a file with the default application: use this at start of web searches rather than trying to click the browser or open a file by clicking.
-            Args:
-                file_or_url: The URL or file to open
-            """
-            self.desktop.open(file_or_url)
-            # Give it time to load
-            time.sleep(2)
-            self.logger.log(f"Opening: {file_or_url}")
-            return f"Opened: {file_or_url}"
-        @tool
-        def launch_app(app_name: str) -> str:
-            """
-            Launches the specified application.
-            Args:
-                app_name: the name of the application to launch
-            """
-            self.desktop.launch(app_name)
-            message = f"Launched app: {app_name}"
-            self.logger.log(message)
-            return message
-        @tool
-        def execute(command: str) -> str:
-            """
-            Executes a terminal command in the desktop environment.
-            Args:
-                command: The command to execute
-            """
-            # Whatever execute_command returns is discarded; only a status line is reported.
-            self.desktop.execute_command(command)
-            message = f"Executed command: {command}"
-            self.logger.log(message)
-            return message
-        @tool
-        def refresh() -> str:
-            """
-            Refreshes the current web page if you're in a browser.
-            """
-            # Sends the Ctrl+R key combination.
-            self.desktop.press(["ctrl", "r"])
-            message = "Refreshed the current page"
-            self.logger.log(message)
-            return message
-        @tool
-        def go_back() -> str:
-            """
-            Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
-            Args:
-            """
-            # Sends the Alt+Left key combination.
-            self.desktop.press(["alt", "left"])
-            message = "Went back one page"
-            self.logger.log(message)
-            return message
-        # Register the tools
-        self.tools["click"] = click
-        self.tools["right_click"] = right_click
-        self.tools["double_click"] = double_click
-        self.tools["move_mouse"] = move_mouse
-        self.tools["write"] = write
-        self.tools["press"] = press
-        self.tools["scroll"] = scroll
-        self.tools["wait"] = wait
-        self.tools["open"] = open
-        self.tools["go_back"] = go_back
-        self.tools["drag"] = drag
-        self.tools["launch_app"] = launch_app
-        self.tools["execute"] = execute
-        self.tools["refresh"] = refresh
-if __name__ == "__main__":
-    # Manual smoke-test entry point: repeatedly reads a task from stdin and runs it
-    # in a fresh sandbox with a fresh PixelCoordinatesAgent.
-    # ================================
-    # MODEL CONFIGURATION
-    # ================================
-    # import os
-    # from smolagents import OpenAIServerModel
-    # model = OpenAIServerModel(
-    #     model_id="gpt-4.1",
-    #     api_key=os.getenv("OPENAI_API_KEY"),
-    # )
-    # For Inference Endpoints
-    # from smolagents import HfApiModel
-    # model = HfApiModel(
-    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
-    #     token=os.getenv("HF_TOKEN"),
-    #     provider="nebius",
-    # )
-    # For Transformer models
-    # from smolagents import TransformersModel
-    # model = TransformersModel(
-    #     model_id="Qwen/Qwen2.5-VL-72B-Instruct",
-    #     device_map="auto",
-    #     torch_dtype="auto",
-    #     trust_remote_code=True,
-    # )
-    # For other providers
-    from smolagents import LiteLLMModel
-    model = LiteLLMModel(model_id="anthropic/claude-sonnet-4-5-20250929")
-    # model = LiteLLMModel(model_id="gemini/gemini-2.5-flash")
-    # ================================
-    # RUN AGENT
-    # ================================
-    # Interactive task input loop
-    while True:
-        # Fresh handles for every task, so the cleanup below can never end the recording
-        # of, or close, a sandbox/agent left over from a previous iteration.
-        sandbox = None
-        agent = None
-        # Read the task outside the guarded section: quitting must not trigger cleanup.
-        try:
-            task = input("\nEnter a task (empty line to quit): ").strip()
-        except (EOFError, KeyboardInterrupt):
-            break
-        if not task:
-            break
-        try:
-            sandbox = Sandbox(headless=False, resolution=(1024, 1024))
-            sandbox.start_recording()
-            # The agent defined in this module; system_prompt is a required argument.
-            agent = PixelCoordinatesAgent(
-                model=model,
-                data_dir="data",
-                desktop=sandbox,
-                system_prompt=PixelCoordinatesSystemPrompt.FORM_SYSTEM_PROMPT,
-            )
-            print("\n🤖 Agent is working on your task...")
-            print("-" * 60)
-            result = agent.run(task)
-            print("\n✅ Task completed successfully!")
-            print(f"📄 Result: {result}")
-        except Exception as e:
-            # Top-level boundary of an interactive loop: report and move on to the next task.
-            print(f"\n❌ Error occurred: {str(e)}")
-        finally:
-            if sandbox is not None:
-                sandbox.end_recording("recording.mp4")
-            if agent is not None:
-                agent.close()
-        print("\n" + "=" * 60)

cua2-core/src/cua2-core/services/agents/prompt.py DELETED Viewed

@@ -1,548 +0,0 @@
-from enum import Enum
-class PixelCoordinatesSystemPrompt(Enum):
-    """System prompt templates for the agent that acts on absolute pixel coordinates.
-    The single member, FORM_SYSTEM_PROMPT, is a web-form automation prompt. Its text contains
-    the placeholders <<current_date>>, <<resolution_x>> and <<resolution_y>> and a Jinja-style
-    loop over tools.values() that lists each tool's name, description, inputs and output type.
-    NOTE(review): the placeholders are presumably substituted by the agent before the prompt
-    is sent to the model - confirm where that happens.
-    """
-    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
-The current date is <<current_date>>.
-<action_process>
-You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
-At each step you will perform **one action**.
-After each action, you will receive an updated screenshot.
-Then you will proceed as follows, with these sections — do not skip any:
-Short term goal: ...
-What I see: ...
-Reflection: ...
-Action:
-```python
-tool_name(arguments)
-```<end_code>
-Always format your Action section as **Python code blocks** exactly as shown above.
-</action_process>
-<tools>
-On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
-{%- for tool in tools.values() %}
-- {{ tool.name }}: {{ tool.description }}
-    Takes inputs: {{tool.inputs}}
-    Returns an output of type: {{tool.output_type}}
-{%- endfor %}
-</tools>
-<web_form_guidelines>
-Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
-The desktop has a resolution of <<resolution_x>>x<<resolution_y>> pixels — use that to decide mouse coordinates.
-**Never use hypothetical or assumed coordinates; always use real coordinates visible on the screenshot.**
-### Typical Web Form Interactions
-- **Input fields**: click in the field first to focus it, then use `write("text")`.
-- **Passwords**: type them just like text — `write("password123")`.
-- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
-- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
-- **Submit buttons**: identify clearly labelled “Sign up”, “Sign in”, “Submit” buttons and click at their coordinates.
-- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
-- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
-### Grouping Multiple Inputs
-- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
-- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
-```python
-click(450, 320)        # Email field
-wait(0.1)
-write("[email protected]")
-click(450, 380)        # Password field
-wait(0.1)
-write("mypassword123")
-click(430, 600)        # Checkbox “Accept terms”
-wait(0.1)
-```<end_code>
-- Only group actions when:
-  1. They’re all part of the **same form or step**,
-  2. The screenshot clearly shows all elements and coordinates,
-  3. The order of operations is obvious.
-- Otherwise, default back to one Action per step.
-### Precision
-- Always **click before typing** to ensure the right field is active.
-- Always **scroll if needed** to bring elements into view before clicking.
-- Always **validate each action** via the screenshot before continuing.
-</web_form_guidelines>
-<task_resolution_example>
-For a task like “Sign up for an account and submit the form”:
-Step 1:
-Short term goal: I want to open the signup page.
-What I see: The browser is open on the homepage.
-Reflection: I will open the signup URL directly.
-Action:
-```python
-open("https://example.com/signup")
-wait(3)
-```<end_code>
-Step 2:
-Short term goal: I want to fill the “Email” field.
-What I see: I see the signup form with an “Email” field at (450, 320).
-Reflection: I will click inside the field then type my email.
-Action:
-```python
-click(450, 320)
-write("[email protected]")
-```<end_code>
-Step 3:
-Short term goal: I want to check the “I accept terms” checkbox.
-What I see: The checkbox is at (430, 600).
-Reflection: I will click it.
-Action:
-```python
-click(430, 600)
-```<end_code>
-Step 4:
-Short term goal: I want to submit the form.
-What I see: The “Sign Up” button at (500, 700).
-Reflection: I will click the button to submit.
-Action:
-```python
-click(500, 700)
-wait(3)
-```<end_code>
-Step 5:
-Short term goal: Verify signup completed.
-What I see: A confirmation page “Welcome [email protected]”.
-Reflection: Task complete.
-Action:
-```python
-final_answer("Signup completed")
-```<end_code>
-</task_resolution_example>
-<general_guidelines>
-# GUI Agent Guidelines for Web Forms
-## Environment Overview
-Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
-Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
-## Core Principles
-### 1. Screenshot Analysis
-- Always analyze the latest screenshot carefully before each action.
-- Validate that previous actions worked by examining the current state.
-- If an action didn’t work, try an alternative rather than repeating blindly.
-### 2. Action Execution
-- Execute one action or multiple actions at a time (grouped in one code block).
-- Wait for appropriate loading times using `wait()` but not indefinitely.
-- Scroll to bring hidden elements into view.
-### 3. Keyboard Shortcuts
-- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
-- Copy/paste: `ctrl+C`, `ctrl+V`.
-- Refresh page: `refresh()`.
-### 4. Error Recovery
-- If clicking doesn’t work, try double_click or right_click.
-- If typing doesn’t appear, ensure the field is focused with click.
-- If popups block the screen, try `press("enter")` or `press("escape")`.
-### 5. Security & Privacy
-- Don’t attempt to bypass captchas or 2FA automatically.
-- Don’t store credentials in plain text unless instructed.
-### 6. Final Answer
-- When the form is successfully submitted or the goal achieved, use:
-```python
-final_answer("Done")
-```<end_code>
-</general_guidelines>
-"""
-class Normalized1000CoordinatesSystemPrompt(Enum):
-    """System prompt templates for the agent that acts on coordinates normalized to 0-1000.
-    The single member, FORM_SYSTEM_PROMPT, is a web-form automation prompt that instructs the
-    model to express every position on a 0 (left/top edge) to 1000 (right/bottom edge) scale
-    instead of pixels. Its text contains the <<current_date>> placeholder and a Jinja-style
-    loop over tools.values() that lists each tool's name, description, inputs and output type.
-    NOTE(review): the placeholder is presumably substituted by the agent before the prompt
-    is sent to the model - confirm where that happens.
-    """
-    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
-The current date is <<current_date>>.
-<action_process>
-You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
-At each step you will perform **one action**.
-After each action, you will receive an updated screenshot.
-Then you will proceed as follows, with these sections — do not skip any:
-Short term goal: ...
-What I see: ...
-Reflection: ...
-Action:
-```python
-tool_name(arguments)
-```<end_code>
-Always format your Action section as **Python code blocks** exactly as shown above.
-</action_process>
-<tools>
-On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
-{%- for tool in tools.values() %}
-- {{ tool.name }}: {{ tool.description }}
-    Takes inputs: {{tool.inputs}}
-    Returns an output of type: {{tool.output_type}}
-{%- endfor %}
-</tools>
-<coordinate_system>
-**IMPORTANT: This system uses NORMALIZED COORDINATES (0 to 1000)**
-You must use normalized coordinates:
-- **x-coordinate**: 0 = left edge, 1000 = right edge of screen
-- **y-coordinate**: 0 = top edge, 1000 = bottom edge of screen
-- **Example**: Center of screen is (500, 500)
-- **Example**: Top-left corner is (0, 0)
-- **Example**: Bottom-right corner is (1000, 1000)
-When you see an element on the screenshot:
-1. Estimate its position relative to the screen dimensions
-2. Convert to normalized coordinates between 0 and 1000
-3. Use these normalized coordinates in your tool calls
-**Never use pixel coordinates directly - always use normalized coordinates between 0 and 1000**
-</coordinate_system>
-<web_form_guidelines>
-Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
-**Always use normalized coordinates (0 to 1000) based on the element's relative position on the screen.**
-### Typical Web Form Interactions
-- **Input fields**: click in the field first to focus it, then use `write("text")`.
-- **Passwords**: type them just like text — `write("password123")`.
-- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle. Click on the box/circle itself at the left side of the text, not on the text label.
-- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
-- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
-- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
-- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
-### Grouping Multiple Inputs
-- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
-- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
-```python
-click(470, 300)        # Email field (normalized coordinates)
-write("[email protected]")
-click(470, 350)        # Password field (normalized coordinates)
-write("mypassword123")
-click(450, 550)        # Checkbox left side of the text "Accept terms" (normalized coordinates)
-```<end_code>
-- Only group actions when:
-  1. They're all part of the **same form or step**,
-  2. The screenshot clearly shows all elements and coordinates,
-  3. The order of operations is obvious.
-- Otherwise, default back to one Action per step.
-### Precision
-- Always **click before typing** to ensure the right field is active.
-- Always **scroll if needed** to bring elements into view before clicking.
-- Always **validate each action** via the screenshot before continuing.
-- Always use **normalized coordinates between 0 and 1000**.
-</web_form_guidelines>
-<task_resolution_example>
-For a task like "Sign up for an account and submit the form":
-Step 1:
-Short term goal: I want to open the signup page.
-What I see: The browser is open on the homepage.
-Reflection: I will open the signup URL directly.
-Action:
-```python
-open("https://example.com/signup")
-wait(3)
-```<end_code>
-Step 2:
-Short term goal: I want to fill the form fields that are currently visible.
-What I see: I see the signup form with "Email" and "Password" fields, plus a checkbox for accepting terms.
-Reflection: I will fill all the visible form fields in sequence - click the email field and type the email, then click the password field and type the password, then click the checkbox to accept terms.
-Action:
-```python
-click(470, 300)        # Email field (normalized coordinates)
-write("[email protected]")
-click(470, 350)        # Password field (normalized coordinates)
-write("mypassword123")
-click(450, 550)        # Checkbox left side of the text "Accept terms" (normalized coordinates)
-```<end_code>
-Step 3:
-Short term goal: I need to scroll down to see the "Sign Up" button.
-What I see: The form fields are filled, but I cannot see the "Sign Up" button - it's likely below the current view.
-Reflection: I will scroll down to bring the submit button into view so I can click it in the next step.
-Action:
-```python
-scroll(500, 500, "down", 3)
-```<end_code>
-Step 4:
-Short term goal: I want to submit the form.
-What I see: The "Sign Up" button is at the bottom center, around 520, 650 in normalized coordinates.
-Reflection: I will click the button to submit.
-Action:
-```python
-click(520, 650)
-wait(3)
-```<end_code>
-Step 5:
-Short term goal: Verify signup completed.
-What I see: A confirmation page "Welcome [email protected]".
-Reflection: Task complete.
-Action:
-```python
-final_answer("Signup completed")
-```<end_code>
-</task_resolution_example>
-<general_guidelines>
-# GUI Agent Guidelines for Web Forms (0-1000 Coordinates)
-## Environment Overview
-Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
-Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
-**All coordinates are normalized between 0 and 1000.**
-## Core Principles
-### 1. Screenshot Analysis
-- Always analyze the latest screenshot carefully before each action.
-- Validate that previous actions worked by examining the current state.
-- If an action didn't work, try an alternative rather than repeating blindly.
-### 2. Action Execution
-- Execute one or multiple actions at a time (grouped in one code block).
-- Wait for appropriate loading times using `wait()` but not indefinitely.
-- Scroll to bring hidden elements into view.
-### 3. Coordinate System
-- **CRITICAL**: Always use normalized coordinates (0 to 1000)
-- Convert visual position on screen to normalized coordinates
-- Center of screen = (500, 500)
-- Top-left = (0, 0), Bottom-right = (1000, 1000)
-### 4. Keyboard Shortcuts
-- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
-- Copy/paste: `ctrl+C`, `ctrl+V`.
-- Refresh page: `refresh()`.
-### 5. Error Recovery
-- If clicking doesn't work, try double_click or right_click.
-- If typing doesn't appear, ensure the field is focused with click.
-- If popups block the screen, try `press("enter")` or `press("escape")`.
-### 6. Security & Privacy
-- Don't attempt to bypass captchas or 2FA automatically.
-- Don't store credentials in plain text unless instructed.
-### 7. Final Answer
-- When the form is successfully submitted or the goal achieved, use:
-```python
-final_answer("Done")
-```<end_code>
-</general_guidelines>
-"""
-class NormalizedCoordinatesSystemPrompt(Enum):
-    """Normalized coordinates system prompt"""
-    FORM_SYSTEM_PROMPT = """You are a web form automation assistant that can control a remote desktop environment with a web browser open.
-The current date is <<current_date>>.
-<action_process>
-You will be given a task to complete in several steps (e.g. filling forms, signing up, logging in, submitting claims).
-At each step you will perform **one action**.
-After each action, you will receive an updated screenshot.
-Then you will proceed as follows, with these sections — do not skip any:
-Short term goal: ...
-What I see: ...
-Reflection: ...
-Action:
-```python
-tool_name(arguments)
-```<end_code>
-Always format your Action section as **Python code blocks** exactly as shown above.
-</action_process>
-<tools>
-On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the desktop, no additional ones:
-{%- for tool in tools.values() %}
-- {{ tool.name }}: {{ tool.description }}
-    Takes inputs: {{tool.inputs}}
-    Returns an output of type: {{tool.output_type}}
-{%- endfor %}
-</tools>
-<coordinate_system>
-**IMPORTANT: This system uses NORMALIZED COORDINATES (0.0 to 1.0)**
-You must use normalized coordinates:
-- **x-coordinate**: 0.0 = left edge, 1.0 = right edge of screen
-- **y-coordinate**: 0.0 = top edge, 1.0 = bottom edge of screen
-- **Example**: Center of screen is (0.5, 0.5)
-- **Example**: Top-left corner is (0.0, 0.0)
-- **Example**: Bottom-right corner is (1.0, 1.0)
-When you see an element on the screenshot:
-1. Estimate its position relative to the screen dimensions
-2. Convert to normalized coordinates between 0.0 and 1.0
-3. Use these normalized coordinates in your tool calls
-**Never use pixel coordinates directly - always use normalized coordinates between 0.0 and 1.0**
-</coordinate_system>
-<web_form_guidelines>
-Look at the elements on the screen (input fields, checkboxes, buttons, dropdowns) to decide where to interact.
-**Always use normalized coordinates (0.0 to 1.0) based on the element's relative position on the screen.**
-### Typical Web Form Interactions
-- **Input fields**: click in the field first to focus it, then use `write("text")`.
-- **Passwords**: type them just like text — `write("password123")`.
-- **Checkboxes / radio buttons**: use `click(x,y)` directly on the box/circle.
-- **Dropdown menus**: click to open, then click the option or use arrow keys + press("enter").
-- **Submit buttons**: identify clearly labelled "Sign up", "Sign in", "Submit" buttons and click at their normalized coordinates.
-- **Captcha or 2FA**: wait or prompt for external handling (do not bypass security).
-- **Pop-ups**: try `press("enter")` to confirm or `press("escape")` to close if they block your action.
-### Grouping Multiple Inputs
-- If multiple fields, checkboxes, or similar controls are clearly visible **and** can be filled/clicked in sequence without ambiguity, you may include **several actions in one code block**.
-- Keep each Action still in a **Python code block** but with multiple tool calls inside, for example:
-```python
-click(0.47, 0.30)        # Email field (normalized coordinates)
-wait(0.1)
-write("[email protected]")
-click(0.47, 0.35)        # Password field (normalized coordinates)
-wait(0.1)
-write("mypassword123")
-click(0.45, 0.55)        # Checkbox "Accept terms" (normalized coordinates)
-wait(0.1)
-```<end_code>
-- Only group actions when:
-  1. They're all part of the **same form or step**,
-  2. The screenshot clearly shows all elements and coordinates,
-  3. The order of operations is obvious.
-- Otherwise, default back to one Action per step.
-### Precision
-- Always **click before typing** to ensure the right field is active.
-- Always **scroll if needed** to bring elements into view before clicking.
-- Always **validate each action** via the screenshot before continuing.
-- Always use **normalized coordinates between 0.0 and 1.0**.
-</web_form_guidelines>
-<task_resolution_example>
-For a task like "Sign up for an account and submit the form":
-Step 1:
-Short term goal: I want to open the signup page.
-What I see: The browser is open on the homepage.
-Reflection: I will open the signup URL directly.
-Action:
-```python
-open("https://example.com/signup")
-wait(3)
-```<end_code>
-Step 2:
-Short term goal: I want to fill the "Email" field.
-What I see: I see the signup form with an "Email" field roughly in the center-left of the screen.
-Reflection: I will click inside the field (approximately 0.47, 0.30 in normalized coordinates) then type my email.
-Action:
-```python
-click(0.47, 0.30)
-write("[email protected]")
-```<end_code>
-Step 3:
-Short term goal: I want to check the "I accept terms" checkbox.
-What I see: The checkbox is in the lower portion of the form, around 0.45, 0.55 in normalized coordinates.
-Reflection: I will click it.
-Action:
-```python
-click(0.45, 0.55)
-```<end_code>
-Step 4:
-Short term goal: I want to submit the form.
-What I see: The "Sign Up" button is at the bottom center, around 0.52, 0.65 in normalized coordinates.
-Reflection: I will click the button to submit.
-Action:
-```python
-click(0.52, 0.65)
-wait(3)
-```<end_code>
-Step 5:
-Short term goal: Verify signup completed.
-What I see: A confirmation page "Welcome [email protected]".
-Reflection: Task complete.
-Action:
-```python
-final_answer("Signup completed")
-```<end_code>
-</task_resolution_example>
-<general_guidelines>
-# GUI Agent Guidelines for Web Forms (Normalized Coordinates)
-## Environment Overview
-Ubuntu 22.04 XFCE4 desktop with Google Chrome/Chromium browser.
-Agent can fill forms, sign up, sign in, click checkboxes, submit claims.
-**All coordinates are normalized between 0.0 and 1.0.**
-## Core Principles
-### 1. Screenshot Analysis
-- Always analyze the latest screenshot carefully before each action.
-- Validate that previous actions worked by examining the current state.
-- If an action didn't work, try an alternative rather than repeating blindly.
-### 2. Action Execution
-- Execute one action at a time.
-- Wait for appropriate loading times using `wait()` but not indefinitely.
-- Scroll to bring hidden elements into view.
-### 3. Coordinate System
-- **CRITICAL**: Always use normalized coordinates (0.0 to 1.0)
-- Convert visual position on screen to normalized coordinates
-- Center of screen = (0.5, 0.5)
-- Top-left = (0.0, 0.0), Bottom-right = (1.0, 1.0)
-### 4. Keyboard Shortcuts
-- Use `tab` to move between fields, `space` to toggle checkboxes, `enter` to submit forms.
-- Copy/paste: `ctrl+C`, `ctrl+V`.
-- Refresh page: `refresh()`.
-### 5. Error Recovery
-- If clicking doesn't work, try double_click or right_click.
-- If typing doesn't appear, ensure the field is focused with click.
-- If popups block the screen, try `press("enter")` or `press("escape")`.
-### 6. Security & Privacy
-- Don't attempt to bypass captchas or 2FA automatically.
-- Don't store credentials in plain text unless instructed.
-### 7. Final Answer
-- When the form is successfully submitted or the goal achieved, use:
-```python
-final_answer("Done")
-```<end_code>
-</general_guidelines>
-"""

cua2-core/src/cua2-core/services/models/anthropic.py DELETED Viewed

@@ -1,10 +0,0 @@
-from smolagents import LiteLLMModel
-class AnthropicModel(LiteLLMModel):
-    """Anthropic model"""
-    MODEL_TYPE = "anthropic"
-    def __init__(self, model_id: str):
-        super().__init__(model_id=model_id)

cua2-core/src/cua2-core/services/models/gemini.py DELETED Viewed

File without changes

cua2-core/src/cua2-core/services/models/get_model.py DELETED Viewed

@@ -1,12 +0,0 @@
-from smolagents import Model
-from backend.models.models import AgentType
-from backend.services.models.anthropic import AnthropicModel
-def get_model(model_id: str) -> tuple[Model, AgentType]:
-    """Get the model"""
-    if "sonnet" in model_id:
-        return AnthropicModel(model_id=model_id), AgentType.PIXEL_COORDINATES
-    else:
-        raise ValueError(f"Model {model_id} not found")

cua2-core/src/cua2-core/services/models/qwen.py DELETED Viewed

File without changes

cua2-core/src/{cua2-core → cua2_core}/__init__.py RENAMED Viewed

File without changes

cua2-core/src/{cua2-core → cua2_core}/app.py RENAMED Viewed

@@ -4,8 +4,8 @@ from dotenv import load_dotenv
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
-from backend.services.agent_service import AgentService
-from backend.websocket.websocket_manager import WebSocketManager
 # Load environment variables
 load_dotenv()

 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from cua2_core.services.agent_service import AgentService
+from cua2_core.websocket.websocket_manager import WebSocketManager
 # Load environment variables
 load_dotenv()

cua2-core/src/{cua2-core → cua2_core}/main.py RENAMED Viewed

@@ -1,10 +1,9 @@
 import os
 import uvicorn
-from backend.app import app
-from backend.routes.routes import router
-from backend.routes.websocket import router as websocket_router
 # Include routes
 app.include_router(router, prefix="/api/v1")
@@ -14,7 +13,7 @@ app.include_router(websocket_router)
 # Health check endpoint (without prefix)
 @app.get("/health")
 async def health():
-    return {"status": "healthy", "service": "computer-use-studio-backend"}
 if __name__ == "__main__":
@@ -29,9 +28,10 @@ if __name__ == "__main__":
     print(f"WebSocket endpoint: ws://{host}:{port}/ws")
     uvicorn.run(
-        "backend.app:app",
         host=host,
         port=port,
-        reload=debug,
         log_level="info" if not debug else "debug",
     )

 import os
 import uvicorn
+from cua2_core.app import app
+from cua2_core.routes.routes import router
+from cua2_core.routes.websocket import router as websocket_router
 # Include routes
 app.include_router(router, prefix="/api/v1")
 # Health check endpoint (without prefix)
 @app.get("/health")
 async def health():
+    return {"status": "healthy", "service": "cua2-core"}
 if __name__ == "__main__":
     print(f"WebSocket endpoint: ws://{host}:{port}/ws")
     uvicorn.run(
+        "cua2_core.app:app",
         host=host,
         port=port,
+        # reload=debug,
+        reload=True,
         log_level="info" if not debug else "debug",
     )

cua2-core/src/cua2_core/models/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Models module for CUA2 Core"""
2	+

cua2-core/src/cua2_core/models/models.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import json
+import os
+from datetime import datetime
+from enum import Enum
+from typing import Annotated, Literal, TypeAlias
+from pydantic import BaseModel, Field, field_serializer, model_validator
+#################### Backend -> Frontend ########################
class AgentAction(BaseModel):
    """A single action performed by the agent.

    ``actionType`` names the action and ``actionArguments`` carries its
    arguments as a plain dict; which keys are read for each action type is
    shown in :meth:`to_string`.
    """

    actionType: Literal[
        "click",
        "write",
        "press",
        "scroll",
        "wait",
        "open",
        "launch_app",
        "refresh",
        "go_back",
    ]
    actionArguments: dict

    def to_string(self) -> str:
        """Return a one-line, human-readable description of the action.

        Arguments missing from ``actionArguments`` are replaced by the
        placeholder defaults used below (``"?"`` for coordinates, ``"down"``
        and ``2`` for scrolling, ``0`` seconds for waiting, empty strings
        otherwise).

        Returns:
            The description. An action type not handled explicitly (only
            possible when validation was bypassed or the ``Literal`` was
            extended without updating this method) yields a generic
            ``"<type>: <arguments>"`` string instead of ``None``.
        """
        kind = self.actionType
        args = self.actionArguments

        if kind == "click":
            return f"Click at coordinates ({args.get('x', '?')}, {args.get('y', '?')})"
        if kind == "write":
            return f"Type text: '{args.get('text', '')}'"
        if kind == "press":
            return f"Press key: {args.get('key', '')}"
        if kind == "scroll":
            return f"Scroll {args.get('direction', 'down')} by {args.get('amount', 2)}"
        if kind == "wait":
            return f"Wait for {args.get('seconds', 0)} seconds"
        if kind == "open":
            return f"Open: {args.get('file_or_url', '')}"
        if kind == "launch_app":
            return f"Launch app: {args.get('app_name', '')}"
        if kind == "refresh":
            return "Refresh the current page"
        if kind == "go_back":
            return "Go back one page"
        # Fallback so the declared ``-> str`` contract always holds.
        return f"{kind}: {args}"
class AgentStep(BaseModel):
    """One step of an agent trace.

    Holds the step's image, the agent's thought, the actions it took, and the
    time/token usage figures for that step.
    """

    traceId: str
    stepId: str
    image: str
    thought: str
    actions: list[AgentAction]
    timeTaken: float
    inputTokensUsed: int
    outputTokensUsed: int
    timestamp: datetime
    step_evaluation: Literal['like', 'dislike', 'neutral']

    @field_serializer('actions')
    def serialize_actions(self, actions: list[AgentAction], _info):
        """Serialize ``actions`` according to the dump context.

        With ``context={"actions_as_json": True}`` each action is dumped as a
        JSON-compatible dict; otherwise each action is rendered through
        ``AgentAction.to_string()``.
        """
        context = _info.context
        wants_json = context.get('actions_as_json', False) if context else False
        if not wants_json:
            return [action.to_string() for action in actions]
        return [action.model_dump(mode="json") for action in actions]
class AgentTraceMetadata(BaseModel):
    """Usage counters for one agent trace; every field defaults to an empty/zero value."""

    # Identifier of the trace these counters belong to.
    traceId: str = ""
    # Token usage counters.
    inputTokensUsed: int = 0
    outputTokensUsed: int = 0
    # Time taken, in seconds.
    timeTaken: float = 0.0
    # Number of steps counted so far.
    numberOfSteps: int = 0
class AgentTrace(BaseModel):
    """A full agent trace: the instruction, the model used, and the recorded steps."""

    id: str
    timestamp: datetime
    instruction: str
    modelId: str
    isRunning: bool
    steps: list[AgentStep] = []
    traceMetadata: AgentTraceMetadata = AgentTraceMetadata()

    @model_validator(mode="after")
    def validate_trace(self):
        """Replace a falsy ``steps`` / ``traceMetadata`` with a fresh empty value."""
        steps_missing = not self.steps
        if steps_missing:
            self.steps = []
        metadata_missing = not self.traceMetadata
        if metadata_missing:
            self.traceMetadata = AgentTraceMetadata()
        return self
+#################### WebSocket Events ########################
class AgentStartEvent(BaseModel):
    """Event announcing that an agent run started; carries the initial trace."""

    type: Literal["agent_start"] = "agent_start"
    agentTrace: AgentTrace


class AgentProgressEvent(BaseModel):
    """Event carrying one agent step together with the trace metadata."""

    type: Literal["agent_progress"] = "agent_progress"
    agentStep: AgentStep
    traceMetadata: AgentTraceMetadata


class AgentCompleteEvent(BaseModel):
    """Event announcing that an agent run finished; carries the trace metadata."""

    type: Literal["agent_complete"] = "agent_complete"
    traceMetadata: AgentTraceMetadata


class AgentErrorEvent(BaseModel):
    """Event reporting an error as a plain message string."""

    type: Literal["agent_error"] = "agent_error"
    error: str


class VncUrlSetEvent(BaseModel):
    """Event publishing the VNC URL."""

    type: Literal["vnc_url_set"] = "vnc_url_set"
    vncUrl: str


class VncUrlUnsetEvent(BaseModel):
    """Event signalling that the VNC URL is no longer set."""

    type: Literal["vnc_url_unset"] = "vnc_url_unset"


class HeartbeatEvent(BaseModel):
    """Payload-free heartbeat event."""

    type: Literal["heartbeat"] = "heartbeat"


# Union of every event model, discriminated by the ``type`` field.
WebSocketEvent: TypeAlias = Annotated[
    AgentStartEvent
    | AgentProgressEvent
    | AgentCompleteEvent
    | AgentErrorEvent
    | VncUrlSetEvent
    | VncUrlUnsetEvent
    | HeartbeatEvent,
    Field(discriminator="type"),
]
+#################### Frontend -> Backend ########################
class UserTaskMessage(BaseModel):
    """Task request sent by the frontend to the backend.

    NOTE(review): the websocket route in this commit reads the raw keys
    ``type`` and ``trace`` from the payload, while this model declares
    ``event_type`` and ``agent_trace`` -- confirm the intended wire format.
    """

    # Must be the literal "user_task"; no default is provided.
    event_type: Literal["user_task"]
    # Optional trace describing the task.
    agent_trace: AgentTrace | None = None
+##################### Agent Service ########################
class ActiveTask(BaseModel):
    """A task currently tracked by the agent service.

    Creating (validating) an instance has a side effect: the task is written
    to ``<trace_path>/tasks.json`` by :meth:`store_model`.
    """

    message_id: str
    instruction: str
    modelId: str
    # default_factory so each task gets its own creation time; a plain
    # ``datetime.now()`` default would be evaluated once, at import time.
    timestamp: datetime = Field(default_factory=datetime.now)
    steps: list[AgentStep] = []
    traceMetadata: AgentTraceMetadata = AgentTraceMetadata()

    @property
    def trace_path(self):
        """Directory (relative to the working directory) holding this task's files."""
        return f"data/trace-{self.message_id}-{self.modelId}"

    @model_validator(mode="after")
    def store_model(self):
        """Stamp the trace id on the metadata and persist the task as JSON.

        Writes ``tasks.json`` inside :attr:`trace_path`, creating the
        directory if needed. Actions are dumped as JSON objects (not as
        human-readable strings) via the ``actions_as_json`` context flag.
        """
        self.traceMetadata.traceId = self.message_id
        os.makedirs(self.trace_path, exist_ok=True)
        payload = self.model_dump(mode="json", context={"actions_as_json": True})
        # Explicit encoding keeps the file identical across platforms.
        with open(f"{self.trace_path}/tasks.json", "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        return self
class HealthResponse(BaseModel):
    """Response body of the health-check endpoint."""

    # Status string reported by the service.
    status: str
    # Timestamp attached to the response.
    timestamp: datetime
    # Count of WebSocket connections.
    websocket_connections: int

cua2-core/src/cua2_core/routes/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Routes module for CUA2 Core"""
2	+

cua2-core/src/{cua2-core → cua2_core}/routes/routes.py RENAMED Viewed

@@ -3,9 +3,9 @@ from datetime import datetime
 from fastapi import APIRouter, Depends, HTTPException, Request
 # Get services from app state
-from backend.models.models import HealthResponse
-from backend.services.agent_service import AgentService
-from backend.websocket.websocket_manager import WebSocketManager
 # Create router
 router = APIRouter()

 from fastapi import APIRouter, Depends, HTTPException, Request
 # Get services from app state
+from cua2_core.models.models import HealthResponse
+from cua2_core.services.agent_service import AgentService
+from cua2_core.websocket.websocket_manager import WebSocketManager
 # Create router
 router = APIRouter()

cua2-core/src/{cua2-core → cua2_core}/routes/websocket.py RENAMED Viewed

@@ -3,8 +3,8 @@ import json
 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 # Get services from app state
-from backend.app import app
-from backend.models.models import UserTaskMessage, WebSocketEvent
 # Create router
 router = APIRouter()
@@ -20,11 +20,8 @@ async def websocket_endpoint(websocket: WebSocket):
     await websocket_manager.connect(websocket)
     try:
-        welcome_message = WebSocketEvent(
-            type="heartbeat",
-            content="WebSocket connection established successfully",
-            messageId="connection_welcome",
-        )
         await websocket_manager.send_personal_message(welcome_message, websocket)
         # Keep the connection alive and wait for messages
@@ -36,27 +33,32 @@ async def websocket_endpoint(websocket: WebSocket):
                 try:
                     # Parse the message
                     message_data = json.loads(data)
-                    message = UserTaskMessage(**message_data)
-                    # Process the user task
-                    if message.type == "user_task":
-                        message_id = await agent_service.process_user_task(
-                            message.content, message.model_id
-                        )
-                        # Send acknowledgment back to the client
-                        response = WebSocketEvent(
-                            type="agent_start",
-                            content=f"Received task: {message.content}",
-                            messageId=message_id,
-                        )
-                        await websocket_manager.send_personal_message(
-                            response, websocket
-                        )
-                except json.JSONDecodeError:
-                    error_response = WebSocketEvent(
-                        type="agent_error", content="Invalid JSON format"
                     )
                     await websocket_manager.send_personal_message(
                         error_response, websocket
@@ -64,9 +66,12 @@ async def websocket_endpoint(websocket: WebSocket):
                 except Exception as e:
                     print(f"Error processing message: {e}")
-                    error_response = WebSocketEvent(
                         type="agent_error",
-                        content=f"Error processing message: {str(e)}",
                     )
                     await websocket_manager.send_personal_message(
                         error_response, websocket

 from fastapi import APIRouter, WebSocket, WebSocketDisconnect
 # Get services from app state
+from cua2_core.app import app
+from cua2_core.models.models import UserTaskMessage, AgentTrace, HeartbeatEvent
 # Create router
 router = APIRouter()
     await websocket_manager.connect(websocket)
     try:
+        # Send welcome heartbeat
+        welcome_message = HeartbeatEvent(type="heartbeat")
         await websocket_manager.send_personal_message(welcome_message, websocket)
         # Keep the connection alive and wait for messages
                 try:
                     # Parse the message
                     message_data = json.loads(data)
+                    print(f"Received message: {message_data}")
+                    # Check if it's a user task message
+                    if message_data.get("type") == "user_task":
+                        # Extract and parse the trace
+                        trace_data = message_data.get("trace")
+                        if trace_data:
+                            # Convert timestamp string to datetime if needed
+                            if isinstance(trace_data.get("timestamp"), str):
+                                from datetime import datetime
+                                trace_data["timestamp"] = datetime.fromisoformat(trace_data["timestamp"].replace("Z", "+00:00"))
+                            trace = AgentTrace(**trace_data)
+                            # Process the user task with the trace
+                            trace_id = await agent_service.process_user_task(trace)
+                            print(f"Started processing trace: {trace_id}")
+                        else:
+                            print("No trace data in message")
+                except json.JSONDecodeError as e:
+                    print(f"JSON decode error: {e}")
+                    from cua2_core.models.models import AgentErrorEvent
+                    error_response = AgentErrorEvent(
+                        type="agent_error",
+                        error="Invalid JSON format"
                     )
                     await websocket_manager.send_personal_message(
                         error_response, websocket
                 except Exception as e:
                     print(f"Error processing message: {e}")
+                    import traceback
+                    traceback.print_exc()
+                    from cua2_core.models.models import AgentErrorEvent
+                    error_response = AgentErrorEvent(
                         type="agent_error",
+                        error=f"Error processing message: {str(e)}"
                     )
                     await websocket_manager.send_personal_message(
                         error_response, websocket

cua2-core/src/cua2_core/services/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """Services module for CUA2 Core"""
2	+

cua2-core/src/cua2_core/services/agent_service.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import asyncio
+import json
+import base64
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from cua2_core.models.models import (
+    ActiveTask,
+    AgentTrace,
+    AgentStep,
+    AgentAction,
+    AgentTraceMetadata,
+    AgentStartEvent,
+    AgentProgressEvent,
+    AgentCompleteEvent,
+    AgentErrorEvent,
+    VncUrlSetEvent,
+    VncUrlUnsetEvent,
+)
+from cua2_core.websocket.websocket_manager import WebSocketManager
class AgentService:
    """Service that accepts user tasks and replays a simulated agent trace.

    Each task is answered by streaming the steps stored in
    ``simulation_metadata/simulated_trace.json`` (next to this module) to all
    WebSocket clients through the WebSocket manager.
    """

    def __init__(self, websocket_manager):
        """Create the service.

        Args:
            websocket_manager: Manager used to broadcast events to clients.
        """
        # Tasks currently being processed, keyed by trace id.
        self.active_tasks: dict[str, ActiveTask] = {}
        self.websocket_manager: WebSocketManager = websocket_manager
        simulation_dir = Path(__file__).parent / "simulation_metadata"
        self.simulation_data_path = simulation_dir / "simulated_trace.json"
        self.simulation_images_path = simulation_dir / "images"
        # Strong references to running background tasks: the event loop only
        # keeps weak references, so an unreferenced task could be
        # garbage-collected before it finishes.
        self._background_tasks: set[asyncio.Task] = set()

    async def process_user_task(self, trace: AgentTrace) -> str:
        """Register a user task and start processing it in the background.

        The trace's ``steps`` and ``traceMetadata`` are reset before the task
        is stored.

        Args:
            trace: Trace describing the task; mutated in place.

        Returns:
            The trace id.
        """
        trace_id = trace.id
        trace.steps = []
        trace.traceMetadata = AgentTraceMetadata(traceId=trace_id)

        # Store the task
        self.active_tasks[trace_id] = ActiveTask(
            message_id=trace_id,
            instruction=trace.instruction,
            modelId=trace.modelId,
            timestamp=trace.timestamp,
            steps=trace.steps,
            traceMetadata=trace.traceMetadata,
        )

        # Start the agent processing in the background, keeping a reference
        # until the task completes.
        task = asyncio.create_task(self._simulate_agent_processing(trace))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

        return trace_id

    def _build_step(self, trace_id: str, step_data: dict) -> AgentStep:
        """Build an ``AgentStep`` from one entry of the simulated trace."""
        # Only the file name of the recorded image path is used; the file is
        # looked up in the local simulation images directory.
        image_path = self.simulation_images_path / step_data["image"].split("/")[-1]
        with open(image_path, "rb") as img_file:
            encoded = base64.b64encode(img_file.read()).decode("utf-8")
        image_base64 = f"data:image/png;base64,{encoded}"

        actions = [
            AgentAction(
                actionType=action["actionType"],
                actionArguments=action["actionArguments"],
            )
            for action in step_data["actions"]
        ]

        return AgentStep(
            traceId=trace_id,
            stepId=step_data["stepId"],
            image=image_base64,
            thought=step_data["thought"],
            actions=actions,
            timeTaken=step_data["timeTaken"],
            inputTokensUsed=step_data["inputTokensUsed"],
            outputTokensUsed=step_data["outputTokensUsed"],
            # fromisoformat on older Pythons rejects a trailing "Z".
            timestamp=datetime.fromisoformat(step_data["timestamp"].replace("Z", "+00:00")),
            step_evaluation=step_data["step_evaluation"],
        )

    async def _simulate_agent_processing(self, trace: AgentTrace):
        """Replay the simulated trace for ``trace`` as WebSocket events.

        Broadcasts, in order: agent start, VNC URL set, one progress event
        per simulated step (after sleeping for the step's ``timeTaken``),
        VNC URL unset and agent complete. Any exception is reported to the
        clients as an agent error event. The task is removed from
        ``active_tasks`` when processing ends, whatever the outcome.
        """
        trace_id = trace.id

        try:
            # Load simulation data
            with open(self.simulation_data_path, "r", encoding="utf-8") as f:
                simulation_data = json.load(f)

            # Send agent start event with the initial trace
            start_event = AgentStartEvent(type="agent_start", agentTrace=trace)
            await self.websocket_manager.broadcast(start_event)

            # mock VNC URL
            vnc_url = "https://www.youtube.com/embed/VCutEsRSJ5A?si=PT0ETJ7zIJ9ywhGW"
            vnc_set_event = VncUrlSetEvent(type="vnc_url_set", vncUrl=vnc_url)
            await self.websocket_manager.broadcast(vnc_set_event)

            trace_metadata = AgentTraceMetadata(traceId=trace_id)

            # Process each step from the simulation data
            for step_data in simulation_data["steps"]:
                # Wait before sending the next step to simulate processing time
                await asyncio.sleep(step_data["timeTaken"])

                agent_step = self._build_step(trace_id, step_data)

                # Running totals sent along with every progress event.
                trace_metadata.numberOfSteps += 1
                trace_metadata.timeTaken += step_data["timeTaken"]
                trace_metadata.inputTokensUsed += step_data["inputTokensUsed"]
                trace_metadata.outputTokensUsed += step_data["outputTokensUsed"]

                # Send progress event
                progress_event = AgentProgressEvent(
                    type="agent_progress",
                    agentStep=agent_step,
                    traceMetadata=trace_metadata,
                )
                await self.websocket_manager.broadcast(progress_event)

                # Update active task
                self.active_tasks[trace_id].steps.append(agent_step)

            # Unset VNC URL before completion
            vnc_unset_event = VncUrlUnsetEvent(type="vnc_url_unset")
            await self.websocket_manager.broadcast(vnc_unset_event)

            # Send completion event
            complete_event = AgentCompleteEvent(
                type="agent_complete",
                traceMetadata=trace_metadata,
            )
            await self.websocket_manager.broadcast(complete_event)

            # Update active task with final metadata
            self.active_tasks[trace_id].traceMetadata = trace_metadata

            # Keep the finished task visible briefly before cleanup.
            await asyncio.sleep(1)

        except Exception as e:
            print(f"Error in agent simulation: {str(e)}")

            # Send error event
            error_event = AgentErrorEvent(
                type="agent_error",
                error=f"Error processing task: {str(e)}",
            )
            await self.websocket_manager.broadcast(error_event)

        finally:
            # Single cleanup point: runs after success, after a reported
            # error, and even if the error broadcast itself fails.
            self.active_tasks.pop(trace_id, None)

    def get_active_tasks(self) -> dict:
        """Return a shallow copy of the active tasks, keyed by trace id."""
        return self.active_tasks.copy()

    def get_task_status(self, message_id: str) -> Optional[ActiveTask]:
        """Return the active task for ``message_id``, or ``None`` if there is none."""
        return self.active_tasks.get(message_id)

cua2-core/src/cua2_core/services/simulation_metadata/simulated_trace.json ADDED Viewed

	@@ -0,0 +1,62 @@

+{
+  "steps": [
+    {
+      "stepId": "step_001",
+      "image": "images/step_1.png",
+      "thought": "I can see a form with multiple input fields. I need to start by clicking on the first name field to begin filling out the form.",
+      "actions": [
+        {
+          "actionType": "click",
+          "actionArguments": {
+            "x": 320,
+            "y": 180
+          }
+        }
+      ],
+      "timeTaken": 2.3,
+      "inputTokensUsed": 1250,
+      "outputTokensUsed": 85,
+      "timestamp": "2025-10-17T14:30:02.300Z",
+      "step_evaluation": "like"
+    },
+    {
+      "stepId": "step_002",
+      "image": "images/step_2.png",
+      "thought": "After clicking the first field, I can see the cursor is active. Now I should proceed to click on the email field to continue with the form submission process.",
+      "actions": [
+        {
+          "actionType": "click",
+          "actionArguments": {
+            "x": 420,
+            "y": 285
+          }
+        }
+      ],
+      "timeTaken": 1.8,
+      "inputTokensUsed": 1180,
+      "outputTokensUsed": 72,
+      "timestamp": "2025-10-17T14:30:04.100Z",
+      "step_evaluation": "like"
+    },
+    {
+      "stepId": "step_003",
+      "image": "images/step_3.png",
+      "thought": "The form appears to be mostly filled. I can see a submit button at the bottom of the form. I'll click on it to complete the form submission.",
+      "actions": [
+        {
+          "actionType": "click",
+          "actionArguments": {
+            "x": 450,
+            "y": 520
+          }
+        }
+      ],
+      "timeTaken": 1.5,
+      "inputTokensUsed": 1100,
+      "outputTokensUsed": 68,
+      "timestamp": "2025-10-17T14:30:05.600Z",
+      "step_evaluation": "like"
+    }
+  ]
+}

cua2-core/src/cua2_core/websocket/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ """WebSocket module for CUA2 Core"""
2	+

cua2-core/src/{cua2-core → cua2_core}/websocket/websocket_manager.py RENAMED Viewed

@@ -4,7 +4,7 @@ from typing import Dict, Optional, Set
 from fastapi import WebSocket
-from backend.models.models import AgentMetadata, WebSocketEvent
 class WebSocketManager:
@@ -35,7 +35,7 @@ class WebSocketManager:
     ):
         """Send a message to a specific WebSocket connection"""
         try:
-            await websocket.send_text(json.dumps(message.model_dump()))
         except Exception as e:
             print(f"Error sending personal message: {e}")
             # Only disconnect if the connection is still in our set
@@ -52,7 +52,7 @@ class WebSocketManager:
         for connection in self.active_connections.copy():
             try:
-                await connection.send_text(json.dumps(message.model_dump()))
             except Exception as e:
                 print(f"Error broadcasting to connection: {e}")
                 disconnected.append(connection)
@@ -77,7 +77,7 @@ class WebSocketManager:
         await self.broadcast(event)
     async def send_agent_complete(
-        self, content: str, message_id: str, metadata: Optional[AgentMetadata] = None
     ):
         """Send agent complete event"""
         event = WebSocketEvent(

 from fastapi import WebSocket
+from cua2_core.models.models import AgentTraceMetadata, WebSocketEvent
 class WebSocketManager:
     ):
         """Send a message to a specific WebSocket connection"""
         try:
+            await websocket.send_text(json.dumps(message.model_dump(mode="json")))
         except Exception as e:
             print(f"Error sending personal message: {e}")
             # Only disconnect if the connection is still in our set
         for connection in self.active_connections.copy():
             try:
+                await connection.send_text(json.dumps(message.model_dump(mode="json")))
             except Exception as e:
                 print(f"Error broadcasting to connection: {e}")
                 disconnected.append(connection)
         await self.broadcast(event)
     async def send_agent_complete(
+        self, content: str, message_id: str, metadata: Optional[AgentTraceMetadata] = None
     ):
         """Send agent complete event"""
         event = WebSocketEvent(