diff --git a/.gitignore b/.gitignore
index 2d83410f..142b465a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -184,3 +184,17 @@ AgentHistoryList.json
# For Docker
data/
+
+# cursor
+.cursorrules
+.cursorignore
+.backup.env
+.brain/**
+
+# Brain directory
+.brain/
+
+.env.google
+*.zip
+traces/
+debug-session/
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 8b09300d..58dcb3c6 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -7,5 +7,11 @@
"source.fixAll.ruff": "explicit",
"source.organizeImports.ruff": "explicit"
}
+ },
+ "dotenv.enableAutocloaking": false,
+ "workbench.colorCustomizations": {
+ "activityBar.background": "#452606",
+ "titleBar.activeBackground": "#603608",
+ "titleBar.activeForeground": "#FEFBF7"
}
}
diff --git a/README.md b/README.md
index 184eeb93..698b00de 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,79 @@
+# Fork Purpose
+
+This fork of browser-use/web-ui adds CLI support specifically designed for AI agents like Cursor Agent. It enables direct command-line interaction with browser automation tasks, making it ideal for integration with AI development environments and automated workflows.
+
+## CLI Documentation
+
+See [CLI Guide](cli/README.md) for comprehensive documentation on:
+- Available LLM providers and models
+- Detailed command reference
+- Environment configuration
+- Example usage patterns
+
+### Quick Start
+
+```bash
+# Run a task (browser will auto-start if needed)
+browser-use run "create a report about the page structure" --url "https://example.com"
+
+# Run with specific provider and vision capabilities
+browser-use run "analyze the layout and visual elements" --url "https://example.com" --provider Google --vision
+
+# Run with specific model selection
+browser-use run "analyze the page" --url "https://example.com" --provider Anthropic --model-index 1
+
+# Explicitly start browser with custom options (optional)
+browser-use start --headless --window-size 1920x1080
+
+# Close browser when done
+browser-use close
+```
+
+### Supported LLM Providers
+
+- **OpenAI** (`gpt-4o`) - Vision-capable model for advanced analysis
+- **Anthropic** (`claude-3-5-sonnet-latest`, `claude-3-5-sonnet-20241022`) - Advanced language understanding
+- **Google** (`gemini-1.5-pro`, `gemini-2.0-flash`) - Fast and efficient processing
+- **DeepSeek** (`deepseek-chat`) - Cost-effective default option
+
+See the [CLI Guide](cli/README.md) for detailed provider configuration and usage examples.
+
+### CLI Commands
+
+- `start` - (Optional) Initialize browser session with custom options:
+ - `--headless` - Run in headless mode
+ - `--window-size` - Set window dimensions (e.g., "1920x1080")
+ - `--disable-security` - Disable security features
+ - `--user-data-dir` - Use custom Chrome profile
+ - `--proxy` - Set proxy server
+
+- `run` - Execute tasks (auto-starts browser if needed):
+  - `--url` - Starting URL for the task (required)
+  - `--provider` - Choose the LLM provider (Deepseek, Google, OpenAI, Anthropic)
+  - `--model-index` - Select a specific model from the provider (0-based)
+  - `--vision` - Enable visual analysis
+  - `--record` - Record browser session
+  - `--trace-path` - Save debugging traces
+  - `--max-steps` - Limit task steps
+  - `--add-info` - Provide additional context
+
+- `close` - Clean up browser session
+
+### Example Tasks
+
+The [browser-tasks-example.ts](cli/browser-tasks-example.ts) file provides ready-to-use task sequences for:
+
+- Product research automation
+- Documentation analysis
+- Page structure analysis
+- Debug sessions with tracing
+
+### Configuration
+
+See [.env.example](.env.example) for all available configuration options, including:
+
+- API keys for different LLM providers
+- Browser settings
+- Session persistence options
+
@@ -51,134 +127,4 @@ Then install playwright:
```bash
playwright install
-```
-
-### Option 2: Docker Installation
-
-1. **Prerequisites:**
- - Docker and Docker Compose installed on your system
- - Git to clone the repository
-
-2. **Setup:**
- ```bash
- # Clone the repository
- git clone https://github.com/browser-use/web-ui.git
- cd web-ui
-
- # Copy and configure environment variables
- cp .env.example .env
- # Edit .env with your preferred text editor and add your API keys
- ```
-
-3. **Run with Docker:**
- ```bash
- # Build and start the container with default settings (browser closes after AI tasks)
- docker compose up --build
-
- # Or run with persistent browser (browser stays open between AI tasks)
- CHROME_PERSISTENT_SESSION=true docker compose up --build
- ```
-
-4. **Access the Application:**
- - WebUI: `http://localhost:7788`
- - VNC Viewer (to see browser interactions): `http://localhost:6080/vnc.html`
-
- Default VNC password is "vncpassword". You can change it by setting the `VNC_PASSWORD` environment variable in your `.env` file.
-
-
-## Usage
-
-### Local Setup
-1. Copy `.env.example` to `.env` and set your environment variables, including API keys for the LLM. `cp .env.example .env`
-2. **Run the WebUI:**
- ```bash
- python webui.py --ip 127.0.0.1 --port 7788
- ```
-4. WebUI options:
- - `--ip`: The IP address to bind the WebUI to. Default is `127.0.0.1`.
- - `--port`: The port to bind the WebUI to. Default is `7788`.
- - `--theme`: The theme for the user interface. Default is `Ocean`.
- - **Default**: The standard theme with a balanced design.
- - **Soft**: A gentle, muted color scheme for a relaxed viewing experience.
- - **Monochrome**: A grayscale theme with minimal color for simplicity and focus.
- - **Glass**: A sleek, semi-transparent design for a modern appearance.
- - **Origin**: A classic, retro-inspired theme for a nostalgic feel.
- - **Citrus**: A vibrant, citrus-inspired palette with bright and fresh colors.
- - **Ocean** (default): A blue, ocean-inspired theme providing a calming effect.
- - `--dark-mode`: Enables dark mode for the user interface.
-3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
-4. **Using Your Own Browser(Optional):**
- - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser.
- - Windows
- ```env
- CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
- CHROME_USER_DATA="C:\Users\YourUsername\AppData\Local\Google\Chrome\User Data"
- ```
- > Note: Replace `YourUsername` with your actual Windows username for Windows systems.
- - Mac
- ```env
- CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
- CHROME_USER_DATA="~/Library/Application Support/Google/Chrome/Profile 1"
- ```
- - Close all Chrome windows
- - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
- - Check the "Use Own Browser" option within the Browser Settings.
-5. **Keep Browser Open(Optional):**
- - Set `CHROME_PERSISTENT_SESSION=true` in the `.env` file.
-
-### Docker Setup
-1. **Environment Variables:**
- - All configuration is done through the `.env` file
- - Available environment variables:
- ```
- # LLM API Keys
- OPENAI_API_KEY=your_key_here
- ANTHROPIC_API_KEY=your_key_here
- GOOGLE_API_KEY=your_key_here
-
- # Browser Settings
- CHROME_PERSISTENT_SESSION=true # Set to true to keep browser open between AI tasks
- RESOLUTION=1920x1080x24 # Custom resolution format: WIDTHxHEIGHTxDEPTH
- RESOLUTION_WIDTH=1920 # Custom width in pixels
- RESOLUTION_HEIGHT=1080 # Custom height in pixels
-
- # VNC Settings
- VNC_PASSWORD=your_vnc_password # Optional, defaults to "vncpassword"
- ```
-
-2. **Browser Persistence Modes:**
- - **Default Mode (CHROME_PERSISTENT_SESSION=false):**
- - Browser opens and closes with each AI task
- - Clean state for each interaction
- - Lower resource usage
-
- - **Persistent Mode (CHROME_PERSISTENT_SESSION=true):**
- - Browser stays open between AI tasks
- - Maintains history and state
- - Allows viewing previous AI interactions
- - Set in `.env` file or via environment variable when starting container
-
-3. **Viewing Browser Interactions:**
- - Access the noVNC viewer at `http://localhost:6080/vnc.html`
- - Enter the VNC password (default: "vncpassword" or what you set in VNC_PASSWORD)
- - You can now see all browser interactions in real-time
-
-4. **Container Management:**
- ```bash
- # Start with persistent browser
- CHROME_PERSISTENT_SESSION=true docker compose up -d
-
- # Start with default mode (browser closes after tasks)
- docker compose up -d
-
- # View logs
- docker compose logs -f
-
- # Stop the container
- docker compose down
- ```
-
-## Changelog
-
-- [x] **2025/01/10:** Thanks to @casistack. Now we have Docker Setup option and also Support keep browser open between tasks.[Video tutorial demo](https://github.com/browser-use/web-ui/issues/1#issuecomment-2582511750).
-- [x] **2025/01/06:** Thanks to @richard-devbot. A New and Well-Designed WebUI is released. [Video tutorial demo](https://github.com/warmshao/browser-use-webui/issues/1#issuecomment-2573393113).
\ No newline at end of file
+```
\ No newline at end of file
diff --git a/analyze_trace.py b/analyze_trace.py
new file mode 100644
index 00000000..a66a26b8
--- /dev/null
+++ b/analyze_trace.py
@@ -0,0 +1,11 @@
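+"""Ad-hoc helper: run the enhanced trace analyzer on a saved trace and pretty-print the result."""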
+from src.trace_analyzer import EnhancedTraceAnalyzer
+import asyncio
+import json
+
+async def main():
+ analyzer = EnhancedTraceAnalyzer('traces/enhanced-test.json')
+ result = await analyzer.analyze_all()
+ print(json.dumps(result, indent=2))
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 00000000..03e7466c
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,161 @@
+# Browser-Use CLI Guide
+
+This guide details the available models and commands for the browser-use CLI tool.
+
+## Available Models
+
+### OpenAI
+- Model: `gpt-4o` (Vision-capable)
+```bash
+# Basic usage
+browser-use run "analyze this webpage" --provider OpenAI
+
+# With vision capabilities
+browser-use run "describe what you see on the page" --provider OpenAI --vision
+```
+
+### Anthropic
+- Models:
+ - `claude-3-5-sonnet-latest` (Default)
+ - `claude-3-5-sonnet-20241022`
+```bash
+# Using default model
+browser-use run "analyze this webpage" --provider Anthropic
+
+# Using specific model version
+browser-use run "analyze this webpage" --provider Anthropic --model-index 1
+```
+
+### Google (Gemini)
+- Models:
+ - `gemini-1.5-pro` (Default)
+ - `gemini-2.0-flash`
+```bash
+# Using default model
+browser-use run "analyze this webpage" --provider Google
+
+# Using flash model
+browser-use run "analyze this webpage" --provider Google --model-index 1
+```
+
+### DeepSeek
+- Model: `deepseek-chat`
+```bash
+# DeepSeek is the default provider
+browser-use run "analyze this webpage"
+
+# Explicitly specifying DeepSeek
+browser-use run "analyze this webpage" --provider Deepseek
+```
+
+## CLI Commands
+
+### Start Browser Session
+```bash
+# Basic start
+browser-use start
+
+# With custom window size
+browser-use start --window-size 1920x1080
+
+# Headless mode
+browser-use start --headless
+
+# With custom Chrome profile
+browser-use start --user-data-dir "/path/to/profile"
+
+# With proxy
+browser-use start --proxy "localhost:8080"
+```
+
+### Run Tasks
+```bash
+# Basic task
+browser-use run "analyze the page" --url "https://example.com"
+
+# With vision capabilities
+browser-use run "describe the visual layout" --url "https://example.com" --vision
+
+# With specific provider and model
+browser-use run "analyze this webpage" --url "https://example.com" --provider Google --model-index 1
+
+# With recording
+browser-use run "test the checkout flow" --url "https://example.com/checkout" --record --record-path ./recordings
+
+# With debugging traces
+browser-use run "analyze form submission" --url "https://example.com/form" --trace-path ./traces
+
+# With step limits
+browser-use run "complex task" --url "https://example.com" --max-steps 5 --max-actions 2
+
+# With additional context
+browser-use run "analyze pricing" --url "https://example.com/pricing" --add-info "Focus on enterprise plans"
+```
+
+### Close Browser
+```bash
+browser-use close
+```
+
+## Environment Variables
+
+Required API keys should be set in your `.env` file:
+```env
+# OpenAI
+OPENAI_API_KEY=your_key_here
+OPENAI_ENDPOINT=https://api.openai.com/v1 # Optional
+
+# Anthropic
+ANTHROPIC_API_KEY=your_key_here
+
+# Google (Gemini)
+GOOGLE_API_KEY=your_key_here
+
+# DeepSeek
+DEEPSEEK_API_KEY=your_key_here
+DEEPSEEK_ENDPOINT=your_endpoint # Optional
+```
+
+## Browser Settings
+
+Optional browser configuration in `.env`:
+```env
+# Custom Chrome settings
+CHROME_PATH=/path/to/chrome
+CHROME_USER_DATA=/path/to/user/data
+
+# Session persistence
+CHROME_PERSISTENT_SESSION=true # Keep browser open between tasks
+```
+
+## Examples
+
+### Visual Analysis Task
+```bash
+browser-use run \
+ "analyze the page layout" \
+ --url "https://example.com" \
+ --provider Google \
+ --vision \
+ --record \
+ --record-path ./recordings
+```
+
+### Multi-Step Task
+```bash
+browser-use run \
+ "fill the form and verify success" \
+ --url "https://example.com/login" \
+ --provider Anthropic \
+ --max-steps 5 \
+ --trace-path ./traces/login
+```
+
+### Research Task
+```bash
+browser-use run \
+ "research pricing information for top 3 competitors" \
+ --url "https://example.com" \
+ --provider OpenAI \
+ --add-info "Focus on enterprise features and annual pricing"
+```
\ No newline at end of file
diff --git a/cli/__init__.py b/cli/__init__.py
new file mode 100644
index 00000000..d1f449a5
--- /dev/null
+++ b/cli/__init__.py
@@ -0,0 +1,3 @@
+"""
+Command-line interface for browser-use.
+"""
\ No newline at end of file
diff --git a/cli/browser-tasks-example.ts b/cli/browser-tasks-example.ts
new file mode 100644
index 00000000..3f39f97a
--- /dev/null
+++ b/cli/browser-tasks-example.ts
@@ -0,0 +1,287 @@
+/**
+ * Browser Automation Task Sequences
+ *
+ * This file defines task sequences for browser automation using the browser-use command.
+ * Each sequence represents a series of browser interactions that can be executed in order.
+ */
+
+export interface BrowserCommand {
+ prompt: string;
+ url: string;
+ provider?: 'Deepseek' | 'Google' | 'OpenAI' | 'Anthropic';
+ modelIndex?: number;
+ headless?: boolean;
+ vision?: boolean;
+ record?: boolean;
+ recordPath?: string;
+ tracePath?: string;
+ maxSteps?: number;
+ maxActions?: number;
+ addInfo?: string;
+ windowSize?: string;
+ userDataDir?: string;
+ proxy?: string;
+}
+
+export interface BrowserTask {
+ description: string;
+ command: BrowserCommand;
+ subtasks?: BrowserTask[];
+}
+
+export interface BrowserTaskSequence {
+ name: string;
+ description: string;
+ tasks: BrowserTask[];
+}
+
+// Example task sequences
+export const browserTasks: BrowserTaskSequence[] = [
+ {
+ name: "Product Research",
+ description: "Compare product prices across multiple e-commerce sites",
+ tasks: [
+ {
+ description: "Search Amazon for wireless earbuds",
+ command: {
+ prompt: "search for 'wireless earbuds' and tell me the price of the top 3 results",
+ url: "https://www.amazon.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Search Best Buy for comparison",
+ command: {
+ prompt: "search for 'wireless earbuds' and tell me the price of the top 3 results",
+ url: "https://www.bestbuy.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Create price comparison",
+ command: {
+ prompt: "create a comparison table of the prices from both sites",
+ url: "about:blank",
+ provider: "Deepseek"
+ }
+ }
+ ]
+ },
+ {
+ name: "Site Health Check",
+ description: "Monitor website availability and performance",
+ tasks: [
+ {
+ description: "Check main site",
+ command: {
+ prompt: "check if it loads properly",
+ url: "https://example.com",
+ provider: "Deepseek",
+ headless: true
+ }
+ },
+ {
+ description: "Verify API health",
+ command: {
+ prompt: "check the API health status",
+ url: "https://api.example.com/health",
+ provider: "Deepseek",
+ headless: true
+ }
+ },
+ {
+ description: "Test documentation site",
+ command: {
+ prompt: "verify all navigation links are working",
+ url: "https://docs.example.com",
+ provider: "Deepseek",
+ headless: true
+ }
+ }
+ ]
+ },
+ {
+ name: "Content Analysis",
+ description: "Analyze blog content and engagement",
+ tasks: [
+ {
+ description: "List articles",
+ command: {
+ prompt: "list all article titles from the homepage",
+ url: "https://blog.example.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Analyze first article",
+ command: {
+ prompt: "click on the first article and summarize its main points",
+ url: "https://blog.example.com",
+ provider: "Deepseek"
+ },
+ subtasks: [
+ {
+ description: "Get metadata",
+ command: {
+ prompt: "tell me the author, publication date, and reading time",
+ url: "https://blog.example.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Analyze comments",
+ command: {
+ prompt: "scroll to the comments section and summarize the main discussion points",
+ url: "https://blog.example.com",
+ provider: "Deepseek"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ {
+ name: "Advanced Content Analysis",
+ description: "Analyze website content using different models for different tasks",
+ tasks: [
+ {
+ description: "Initial navigation and basic text extraction",
+ command: {
+ prompt: "navigate to the Actions documentation and extract basic text content",
+ url: "https://docs.github.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Visual analysis of page structure",
+ command: {
+ prompt: "analyze the layout of the page and tell me how the documentation is structured, including sidebars, navigation, and content areas",
+ url: "https://docs.github.com",
+ provider: "Google",
+ vision: true,
+ modelIndex: 1,
+ addInfo: "Only using Google here because we need vision capabilities"
+ }
+ },
+ {
+ description: "Complex content summarization",
+ command: {
+ prompt: "summarize the key concepts of GitHub Actions based on the documentation",
+ url: "https://docs.github.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Extract code examples",
+ command: {
+ prompt: "find and list all YAML workflow examples on the page",
+ url: "https://docs.github.com",
+ provider: "Deepseek"
+ }
+ }
+ ]
+ },
+ {
+ name: "Page Structure Analysis",
+ description: "Generate detailed reports about page structure and interactive elements",
+ tasks: [
+ {
+ description: "Analyze homepage structure",
+ command: {
+ prompt: "create a report about the page structure, including the page title, headings, and any interactive elements found",
+ url: "https://example.com",
+ provider: "Deepseek"
+ }
+ },
+ {
+ description: "Analyze navigation structure",
+ command: {
+ prompt: "focus on the navigation menu and create a detailed report of its structure and all available links",
+ url: "https://example.com",
+ provider: "Google",
+ vision: true,
+ addInfo: "Only using Google here because we need vision capabilities for complex layout analysis"
+ }
+ },
+ {
+ description: "Document forms and inputs",
+ command: {
+ prompt: "find all forms on the page and document their inputs, buttons, and validation requirements",
+ url: "https://example.com",
+ provider: "Google",
+ vision: true,
+ addInfo: "Only using Google here because we need vision capabilities for form analysis"
+ }
+ }
+ ]
+ },
+ {
+ name: "Debug Session",
+ description: "Record and analyze browser interactions for debugging",
+ tasks: [
+ {
+ description: "Start debug session",
+ command: {
+ prompt: "attempt to log in with test credentials",
+ url: "https://example.com/login",
+ provider: "Deepseek",
+ headless: false,
+ tracePath: "./tmp/traces/login",
+ record: true,
+ recordPath: "./recordings/login"
+ }
+ },
+ {
+ description: "Navigate complex workflow",
+ command: {
+ prompt: "complete the multi-step registration process",
+ url: "https://example.com/register",
+ provider: "Deepseek",
+ maxSteps: 5,
+ maxActions: 2,
+ tracePath: "./tmp/traces/registration"
+ }
+ },
+ {
+ description: "Generate debug report",
+ command: {
+ prompt: "create a report of all actions taken and any errors encountered",
+ url: "about:blank",
+ provider: "Deepseek",
+ addInfo: "Focus on error patterns and user interaction points"
+ }
+ }
+ ]
+ }
+];
+
+// Updated execute task function to match CLI arguments
+const executeTask = (task: BrowserCommand): string => {
+ const options: string[] = [];
+
+ if (task.provider) options.push(`--provider ${task.provider}`);
+ if (task.modelIndex !== undefined) options.push(`--model-index ${task.modelIndex}`);
+ if (task.headless) options.push('--headless');
+ if (task.vision) options.push('--vision');
+ if (task.record) {
+ options.push('--record');
+ if (task.recordPath) options.push(`--record-path ${task.recordPath}`);
+ }
+ if (task.tracePath) options.push(`--trace-path ${task.tracePath}`);
+ if (task.maxSteps) options.push(`--max-steps ${task.maxSteps}`);
+ if (task.maxActions) options.push(`--max-actions ${task.maxActions}`);
+ if (task.addInfo) options.push(`--add-info "${task.addInfo}"`);
+ if (task.windowSize) options.push(`--window-size ${task.windowSize}`);
+ if (task.userDataDir) options.push(`--user-data-dir "${task.userDataDir}"`);
+ if (task.proxy) options.push(`--proxy "${task.proxy}"`);
+
+ return `browser-use run "${task.prompt}" --url "${task.url}" ${options.join(' ')}`.trim();
+};
+
+// Example usage:
+const sequence = browserTasks[0]; // Get Product Research sequence
+console.log(`Executing sequence: ${sequence.name}`);
+sequence.tasks.forEach(task => {
+ console.log(`\n${task.description}:`);
+ console.log(executeTask(task.command));
+});
\ No newline at end of file
diff --git a/cli/browser-use b/cli/browser-use
new file mode 100755
index 00000000..4c83fb85
--- /dev/null
+++ b/cli/browser-use
@@ -0,0 +1,76 @@
+#!/bin/bash
+
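+# Wrapper around cli/browser_use_cli.py: resolves the project root, activates
+# the virtual environment, forwards all arguments, and uses a temporary state
+# file to decide whether to keep the venv active for a persistent session.
+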
+# Get the absolute path of the script's real location (dereference symbolic link)
+REAL_SCRIPT_PATH=$(readlink -f "${BASH_SOURCE[0]}")
+
+# Get the directory of the script
+SCRIPT_DIR="$(dirname "$REAL_SCRIPT_PATH")"
+
+# Project root is one level up from the script's directory
+PROJECT_ROOT="$SCRIPT_DIR/.."
+
+# Change to the project root directory
+cd "$PROJECT_ROOT"
+
+# Activate the virtual environment
+if [ -f "venv/bin/activate" ]; then
+ echo "Activating virtual environment"
+ source "venv/bin/activate"
+ echo "VIRTUAL_ENV: $VIRTUAL_ENV"
+else
+ echo "Virtual environment activation script not found"
+fi
+
+# Create a temporary file for state transfer
+TEMP_STATE_FILE=$(mktemp)
+echo "Created temporary state file: $TEMP_STATE_FILE"
+
+# Run the Python script and capture its exit code
+echo "Running: venv/bin/python cli/browser_use_cli.py $*"
+"venv/bin/python" "cli/browser_use_cli.py" "$@" --temp-file "$TEMP_STATE_FILE"
+PYTHON_EXIT_CODE=$?
+
+# If the Python script exited with a non-zero code, clean up and exit with the same code
+if [ $PYTHON_EXIT_CODE -ne 0 ]; then
+    echo "Error running command. Python script exited with code: $PYTHON_EXIT_CODE"
+    echo "Cleaning up temp file: $TEMP_STATE_FILE"
+    rm -f "$TEMP_STATE_FILE"
+    exit $PYTHON_EXIT_CODE
+fi
+
+# Read the BROWSER_USE_RUNNING value from the temporary file
+if [ -f "$TEMP_STATE_FILE" ]; then
+ BROWSER_USE_RUNNING=$(cat "$TEMP_STATE_FILE")
+ echo "Read BROWSER_USE_RUNNING from file: $BROWSER_USE_RUNNING"
+ echo "Cleaning up temp file: $TEMP_STATE_FILE"
+ rm -f "$TEMP_STATE_FILE"
+else
+ BROWSER_USE_RUNNING="false"
+ echo "Warning: Temp file not found at: $TEMP_STATE_FILE"
+ echo "Defaulting BROWSER_USE_RUNNING to: false"
+fi
+
+# Set the environment variable in the shell script based on captured value
+export BROWSER_USE_RUNNING
+echo "Environment variable BROWSER_USE_RUNNING set to: $BROWSER_USE_RUNNING"
+
+# Check if the BROWSER_USE_RUNNING environment variable is set to true
+echo "BROWSER_USE_RUNNING: $BROWSER_USE_RUNNING"
+if [ "$BROWSER_USE_RUNNING" = "true" ]; then
+ echo "Keeping virtual environment active for persistent session."
+else
+ # Deactivate the virtual environment only if not running persistently
+ if [ -n "$VIRTUAL_ENV" ]; then
+ echo "Deactivating virtual environment"
+ deactivate
+ else
+ echo "Virtual environment was not active."
+ fi
+fi
\ No newline at end of file
diff --git a/cli/browser-use.toolchain.json b/cli/browser-use.toolchain.json
new file mode 100644
index 00000000..18ca2c0b
--- /dev/null
+++ b/cli/browser-use.toolchain.json
@@ -0,0 +1,114 @@
+{
+ "name": "browser-use",
+ "description": "Execute natural language browser automation commands",
+ "type": "terminal_command",
+ "functions": [
+ {
+ "name": "browser_command",
+ "description": "Control a browser using natural language instructions",
+ "parameters": {
+ "properties": {
+ "prompt": {
+ "type": "string",
+ "description": "The natural language instruction (e.g., 'go to google.com and search for OpenAI'). **Ensure URLs are well-formed and include the protocol (e.g., https://).**"
+ },
+ "url": {
+ "type": "string",
+ "description": "The starting URL for the browser automation task. Must include the protocol (e.g., https://example.com)."
+ },
+ "provider": {
+ "type": "string",
+ "enum": [
+ "Deepseek",
+ "Google",
+ "OpenAI",
+ "Anthropic"
+ ],
+ "default": "Deepseek",
+ "description": "The LLM provider to use. DeepSeek is recommended for most tasks due to its cost-effectiveness and performance. The system will automatically select the appropriate model based on your task requirements (e.g., vision capabilities)."
+ },
+ "model_index": {
+ "type": "integer",
+ "description": "Optional index to select a specific model from the provider's available models (0-based). Available models per provider:\nDeepseek: [0: deepseek-chat]\nGoogle: [0: gemini-1.5-pro, 1: gemini-2.0-flash]\nOpenAI: [0: gpt-4o]\nAnthropic: [0: claude-3-5-sonnet-latest, 1: claude-3-5-sonnet-20241022]"
+ },
+ "vision": {
+ "type": "boolean",
+ "default": false,
+ "description": "Enable vision capabilities (optional). **When enabled, the system will automatically select a vision-capable model from your chosen provider.**"
+ },
+ "headless": {
+ "type": "boolean",
+ "default": false,
+ "description": "Run browser in headless mode (optional). **Headless mode might be necessary for certain environments or tasks but can limit interaction with visually-dependent elements.**"
+ },
+ "record": {
+ "type": "boolean",
+ "default": false,
+ "description": "Enable session recording (optional). **Useful for debugging and understanding the agent's actions.**"
+ },
+ "recordPath": {
+ "type": "string",
+ "default": "./tmp/record_videos",
+ "description": "Path to save recordings (optional). **Ensure the directory exists and is writable.**"
+ },
+ "tracePath": {
+ "type": "string",
+ "description": "Path to save debugging traces (optional). **Traces can provide detailed information about the automation process.**"
+ },
+ "maxSteps": {
+ "type": "integer",
+ "default": 10,
+ "description": "Maximum number of steps per task (optional). **Increase this for complex tasks, but be mindful of potential infinite loops.**"
+ },
+ "maxActions": {
+ "type": "integer",
+ "default": 1,
+ "description": "Maximum actions per step (optional). **Adjust this based on the complexity of each step.**"
+ },
+ "addInfo": {
+ "type": "string",
+ "description": "Additional context or instructions for the agent (optional). **Use this to provide specific details not covered in the main prompt.**"
+ },
+ "tempFile": {
+ "type": "string",
+ "description": "Path to temporary file to store the browser session state (optional). **Used for resuming or closing specific sessions.**"
+ },
+ "userDataDir": {
+ "type": "string",
+ "description": "Path to user data directory for a persistent browser session (optional). **Use this to maintain browser state across sessions (e.g., cookies, extensions).**"
+ }
+ },
+ "required": [
+ "prompt",
+ "url"
+ ]
+ }
+ }
+ ],
+ "examples": [
+ {
+ "description": "Basic usage with default provider (DeepSeek)",
+ "command": "browser-use run \"search for OpenAI\" --url \"https://www.google.com\""
+ },
+ {
+ "description": "Using Google Gemini with vision for visual analysis",
+ "command": "browser-use run \"analyze the visual layout\" --url \"https://www.openai.com\" --provider Google --vision"
+ },
+ {
+ "description": "Using OpenAI for complex analysis",
+ "command": "browser-use run \"analyze the layout and design\" --url \"https://www.example.com\" --provider OpenAI --vision"
+ },
+ {
+ "description": "Using Anthropic with specific model version",
+ "command": "browser-use run \"analyze the documentation\" --url \"https://docs.example.com\" --provider Anthropic --model-index 1"
+ },
+ {
+ "description": "Running a check in headless mode",
+ "command": "browser-use run \"check if site is up\" --url \"https://www.github.com\" --provider Deepseek --headless"
+ },
+ {
+ "description": "Recording a debug session",
+ "command": "browser-use run \"test the login process\" --url \"https://example.com\" --provider Google --record --record-path ./debug_session"
+ }
+ ]
+}
\ No newline at end of file
diff --git a/cli/browser_use_cli.py b/cli/browser_use_cli.py
new file mode 100644
index 00000000..fde15355
--- /dev/null
+++ b/cli/browser_use_cli.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+import asyncio
+import argparse
+import os
+import sys
+from pathlib import Path
+import json
+import tempfile
+
+# Add the project root to PYTHONPATH
+project_root = str(Path(__file__).parent.parent)
+sys.path.append(project_root)
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext, BrowserContextConfig, BrowserContextWindowSize
+from src.agent.custom_agent import CustomAgent
+from src.controller.custom_controller import CustomController
+from src.agent.custom_prompts import CustomSystemPrompt
+from src.utils import utils
+from dotenv import load_dotenv
+from src.trace_analyzer import analyze_trace
+
+# Load .env from the project root
+load_dotenv(Path(project_root) / '.env')
+
+# Global variables for browser persistence
+_global_browser = None
+_global_browser_context = None
+
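+# Note: the bash wrapper (cli/browser-use) creates a temporary state file,
+# passes it via --temp-file, and reads the "true"/"false" value written there
+# after each command to decide whether to keep its virtual environment active.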
+def _get_browser_state():
+ """Get browser state from temporary file."""
+ temp_file = os.path.join(tempfile.gettempdir(), "browser_use_state")
+ try:
+ with open(temp_file, "r") as f:
+ return f.read().strip().lower() == "true"
+ except FileNotFoundError:
+ return False
+
+def _set_browser_state(running=True, temp_file_path=None):
+ """Set browser state in a temporary file."""
+ value = str(running).lower()
+ if temp_file_path:
+ with open(temp_file_path, "w") as f:
+ f.write(value)
+
+async def initialize_browser(
+ headless=False,
+ window_size=(1920, 1080),
+ disable_security=False,
+ user_data_dir=None,
+ proxy=None
+):
+ """Initialize a new browser instance with the given configuration."""
+ global _global_browser, _global_browser_context
+
+ # Check both environment and global variables
+ if _get_browser_state() or _global_browser is not None:
+ # Close any existing browser first
+ if _global_browser is not None:
+ await close_browser()
+ else:
+ _set_browser_state(False)
+
+ window_w, window_h = window_size
+
+ # Initialize browser with launch-time options
+ browser = Browser(
+ config=BrowserConfig(
+ headless=headless,
+ disable_security=disable_security,
+ chrome_instance_path=user_data_dir,
+ extra_chromium_args=[f"--window-size={window_w},{window_h}"],
+ proxy=proxy
+ )
+ )
+
+ # Create initial browser context
+ browser_context = await browser.new_context(
+ config=BrowserContextConfig(
+ no_viewport=False,
+ browser_window_size=BrowserContextWindowSize(
+ width=window_w,
+ height=window_h
+ ),
+ disable_security=disable_security
+ )
+ )
+
+ # Store globally
+ _global_browser = browser
+ _global_browser_context = browser_context
+ _set_browser_state(True)
+ return True
+
+async def close_browser():
+ """Close the current browser instance if one exists."""
+ global _global_browser, _global_browser_context
+
+ if _global_browser_context is not None:
+ await _global_browser_context.close()
+ _global_browser_context = None
+
+ if _global_browser is not None:
+ await _global_browser.close()
+ _global_browser = None
+
+ _set_browser_state(False)
+
+async def run_browser_task(
+ prompt,
+ url=None,
+ provider="Deepseek",
+ model_index=None,
+ vision=False,
+ record=False,
+ record_path=None,
+ trace_path=None,
+ hide_trace=False,
+ max_steps=10,
+ max_actions=1,
+ add_info="",
+ on_init=None,
+ headless=False,
+ window_size=(1920, 1080),
+ disable_security=False,
+ user_data_dir=None,
+ proxy=None
+):
+ """Execute a task using the current browser instance, auto-initializing if needed."""
+ global _global_browser, _global_browser_context
+
+ # Validate URL if provided
+ if url:
+ try:
+ from urllib.parse import urlparse
+ result = urlparse(url)
+ if not all([result.scheme, result.netloc]):
+ raise ValueError("Invalid URL format")
+ except Exception as e:
+ return f"Invalid URL provided: {str(e)}"
+
+ # Store the trace file path if tracing is enabled
+ trace_file = None
+
+ # Check if browser is running and initialize if needed
+ if not _get_browser_state():
+ print("Browser not running. Starting browser session...")
+ if not await initialize_browser(
+ headless=headless,
+ window_size=window_size,
+ disable_security=disable_security,
+ user_data_dir=user_data_dir,
+ proxy=proxy
+ ):
+ return "Browser initialization failed"
+
+ # Signal successful initialization if callback provided
+ if _get_browser_state() and on_init:
+ await on_init()
+
+ # Verify browser state is consistent
+ if _global_browser is None or _global_browser_context is None:
+ print("Browser session state is inconsistent. Attempting to reinitialize...")
+ if not await initialize_browser(
+ headless=headless,
+ window_size=window_size,
+ disable_security=disable_security,
+ user_data_dir=user_data_dir,
+ proxy=proxy
+ ):
+ return "Browser reinitialization failed"
+ if _global_browser is None or _global_browser_context is None:
+ return "Browser session state remains inconsistent after reinitialization"
+
+ # Initialize controller
+ controller = CustomController()
+
+ # Normalize provider name to lowercase for consistency
+ provider = provider.lower()
+
+ # Handle Deepseek + vision case
+ if provider == "deepseek" and vision:
+ print("WARNING: Deepseek does not support vision capabilities. Falling back to standard Deepseek model.")
+ vision = False
+
+    # Select appropriate model based on provider, model_index, and vision requirement.
+    # Map the CLI provider name to the key used in utils.model_names.
+    provider_map = {
+        "google": "gemini",
+        "openai": "openai",
+        "anthropic": "anthropic",
+        "deepseek": "deepseek",
+    }
+    if provider not in provider_map:
+        raise ValueError(f"Unsupported provider: {provider}")
+    provider_key = provider_map[provider]
+
+ if provider_key not in utils.model_names:
+ raise ValueError(f"No models found for provider: {provider}")
+
+ available_models = utils.model_names[provider_key]
+
+ if model_index is not None:
+ if not (0 <= model_index < len(available_models)):
+ raise ValueError(f"Invalid model_index {model_index} for provider {provider}. Available indices: 0-{len(available_models)-1}")
+ model_name = available_models[model_index]
+    else:
+        # Default to the provider's first listed model
+        # (deepseek-chat, gemini-1.5-pro, gpt-4o, or claude-3-5-sonnet-latest)
+        model_name = available_models[0]
+
+ # Get LLM model
+ llm = utils.get_llm_model(
+ provider=provider_key,
+ model_name=model_name,
+ temperature=0.8,
+ vision=vision
+ )
+
+ # Create new context with tracing/recording enabled
+ if record or trace_path:
+ # Close existing context first
+ if _global_browser_context is not None:
+ await _global_browser_context.close()
+
+ # Create new context with tracing/recording enabled
+ if trace_path:
+ trace_dir = Path(trace_path)
+ if not trace_path.endswith('.zip'):
+ trace_dir = trace_dir / 'trace.zip'
+ trace_dir.parent.mkdir(parents=True, exist_ok=True)
+ trace_file = str(trace_dir)
+ else:
+ trace_file = None
+
+ _global_browser_context = await _global_browser.new_context(
+ config=BrowserContextConfig(
+ trace_path=trace_file,
+ save_recording_path=str(record_path) if record else None,
+ no_viewport=False,
+ browser_window_size=BrowserContextWindowSize(
+ width=1920,
+ height=1080
+ ),
+ disable_security=False
+ )
+ )
+
+ # Initialize agent with starting URL if provided
+ agent = CustomAgent(
+ task=f"First, navigate to {url}. Then, {prompt}" if url else prompt,
+ add_infos=add_info,
+ llm=llm,
+ browser=_global_browser,
+ browser_context=_global_browser_context,
+ controller=controller,
+ system_prompt_class=CustomSystemPrompt,
+ use_vision=vision,
+ tool_call_in_content=True,
+ max_actions_per_step=max_actions
+ )
+
+ # Run task
+ history = await agent.run(max_steps=max_steps)
+ result = history.final_result()
+
+ # Close the context to ensure trace is saved
+ if _global_browser_context is not None:
+ await _global_browser_context.close()
+ _global_browser_context = None
+
+ # Analyze and display trace if enabled
+ if trace_file and not hide_trace:
+ print("\nTrace Analysis:")
+ print("=" * 50)
+ try:
+ # Find the actual trace file in the nested directory
+ trace_files = list(Path(str(trace_path)).rglob('*.zip'))
+ if trace_files:
+                actual_trace = str(trace_files[0])  # Use the first trace file found
+                try:
+ trace_analysis = await analyze_trace(actual_trace)
+ print(json.dumps(trace_analysis, indent=2))
+ except Exception as e:
+ print(f"Failed to analyze trace: {e}")
+ else:
+ print("No trace file found")
+ except Exception as e:
+ print(f"Error finding trace file: {e}")
+
+ return result
+
+def main():
+ parser = argparse.ArgumentParser(description="Control a browser using natural language")
+ subparsers = parser.add_subparsers(dest="command", help="Commands")
+
+ # Start command
+ start_parser = subparsers.add_parser("start", help="Start a new browser session")
+ start_parser.add_argument("--temp-file", help="Path to temporary file for storing browser state")
+ start_parser.add_argument("--headless", action="store_true", help="Run browser in headless mode")
+ start_parser.add_argument("--window-size", default="1920x1080", help="Browser window size (WxH)")
+ start_parser.add_argument("--disable-security", action="store_true", help="Disable browser security features")
+ start_parser.add_argument("--user-data-dir", help="Use custom Chrome profile directory")
+ start_parser.add_argument("--proxy", help="Proxy server URL")
+
+ # Run command
+ run_parser = subparsers.add_parser("run", help="Run a task in the current browser session")
+ run_parser.add_argument("--temp-file", help="Path to temporary file for storing browser state")
+ run_parser.add_argument("prompt", help="The task to perform")
+ run_parser.add_argument("--url", required=True, help="The starting URL for the browser automation task")
+ run_parser.add_argument("--provider", "-p", choices=["Deepseek", "Google", "OpenAI", "Anthropic"],
+ default="Deepseek", help="The LLM provider to use (system will select appropriate model)")
+ run_parser.add_argument("--model-index", "-m", type=int,
+ help="Optional index to select a specific model from the provider's available models (0-based)")
+ run_parser.add_argument("--vision", action="store_true", help="Enable vision capabilities")
+ run_parser.add_argument("--record", action="store_true", help="Enable session recording")
+ run_parser.add_argument("--record-path", default="./tmp/record_videos", help="Path to save recordings")
+ run_parser.add_argument("--trace-path", default="./tmp/traces", help="Path to save debugging traces")
+ run_parser.add_argument("--hide-trace", action="store_true", help="Don't display trace analysis after task completion")
+ run_parser.add_argument("--max-steps", type=int, default=10, help="Maximum number of steps per task")
+ run_parser.add_argument("--max-actions", type=int, default=1, help="Maximum actions per step")
+ run_parser.add_argument("--add-info", help="Additional context for the agent")
+
+ # Close command
+ close_parser = subparsers.add_parser("close", help="Close the current browser session")
+ close_parser.add_argument("--temp-file", help="Path to temporary file for storing browser state")
+
+ # Analyze trace command
+ analyze_parser = subparsers.add_parser("analyze-trace", help="Analyze a Playwright trace file")
+ analyze_parser.add_argument("trace_path", help="Path to the trace file")
+ analyze_parser.add_argument("--output", "-o", help="Path to save the analysis output (default: print to stdout)")
+
+ args = parser.parse_args()
+
+ if args.command == "start":
+ # Parse window size
+ try:
+ window_w, window_h = map(int, args.window_size.split('x'))
+ except ValueError:
+ print(f"Invalid window size format: {args.window_size}. Using default 1920x1080")
+ window_w, window_h = 1920, 1080
+
+ # Start browser
+ success = asyncio.run(initialize_browser(
+ headless=args.headless,
+ window_size=(window_w, window_h),
+ disable_security=args.disable_security,
+ user_data_dir=args.user_data_dir,
+ proxy=args.proxy
+ ))
+ if success:
+ print("Browser session started successfully")
+ _set_browser_state(True, args.temp_file)
+ else:
+ print("Failed to start browser session")
+ _set_browser_state(False, args.temp_file)
+
+ elif args.command == "run":
+ # Run task
+ result = asyncio.run(run_browser_task(
+ prompt=args.prompt,
+ url=args.url,
+ provider=args.provider,
+ model_index=args.model_index,
+ vision=args.vision,
+ record=args.record,
+ record_path=args.record_path if args.record else None,
+ trace_path=args.trace_path,
+ hide_trace=args.hide_trace,
+ max_steps=args.max_steps,
+ max_actions=args.max_actions,
+ add_info=args.add_info,
+ headless=False,
+ window_size=(1920, 1080),
+ disable_security=False,
+ user_data_dir=None,
+ proxy=None
+ ))
+ if result:
+ print(result)
+
+ elif args.command == "close":
+ # Close browser
+ asyncio.run(close_browser())
+ print("Browser session closed")
+ _set_browser_state(False, args.temp_file)
+
+ elif args.command == "analyze-trace":
+ # Analyze trace
+ result = asyncio.run(analyze_trace(args.trace_path))
+ if args.output:
+ with open(args.output, 'w') as f:
+ json.dump(result, f, indent=2)
+ print(f"Analysis saved to {args.output}")
+ else:
+ print(json.dumps(result, indent=2))
+
+ else:
+ parser.print_help()
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/cli/usage-guide.md b/cli/usage-guide.md
new file mode 100644
index 00000000..8a26a61e
--- /dev/null
+++ b/cli/usage-guide.md
@@ -0,0 +1,308 @@
+# Browser-Use API Usage Guide
+
+## Overview
+
+This guide explains how to use the browser-use API to automate browser interactions using different LLM models. The API provides a powerful way to control a browser programmatically through Python.
+
+## Basic Setup
+
+```python
+import asyncio
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContextConfig, BrowserContextWindowSize
+from src.agent.custom_agent import CustomAgent
+from src.controller.custom_controller import CustomController
+from src.agent.custom_prompts import CustomSystemPrompt
+from src.utils import utils
+import os
+
+# Window size configuration
+window_w, window_h = 1920, 1080
+
+# Browser initialization
+browser = Browser(
+ config=BrowserConfig(
+ headless=False, # Set to True for headless mode
+ disable_security=True,
+ extra_chromium_args=[f"--window-size={window_w},{window_h}"],
+ )
+)
+```
+
+## Browser Context Configuration
+
+```python
+# Create a browser context with recording and tracing
+browser_context = await browser.new_context(
+ config=BrowserContextConfig(
+ trace_path="./tmp/traces", # For debugging
+ save_recording_path="./tmp/record_videos", # For session recording
+ no_viewport=False,
+ browser_window_size=BrowserContextWindowSize(
+ width=window_w, height=window_h
+ ),
+ )
+)
+```
+
+## Model Configuration
+
+### DeepSeek (Default)
+
+```python
+llm = utils.get_llm_model(
+ provider="deepseek",
+ model_name="deepseek-chat", # V2.5 model
+ temperature=0.8,
+ base_url="https://api.deepseek.com/v1",
+ api_key=os.getenv("DEEPSEEK_API_KEY", "")
+)
+```
+
+### Gemini Pro
+
+```python
+llm = utils.get_llm_model(
+ provider="gemini",
+ model_name="gemini-2.0-flash-exp",
+ temperature=1.0,
+ api_key=os.getenv("GOOGLE_API_KEY", "")
+)
+```
+
+### GPT-4 Turbo
+
+```python
+llm = utils.get_llm_model(
+ provider="openai",
+ model_name="gpt-4-turbo-preview",
+ temperature=0.8,
+ api_key=os.getenv("OPENAI_API_KEY", "")
+)
+```
+
+### Claude-3 Opus
+
+```python
+llm = utils.get_llm_model(
+ provider="anthropic",
+ model_name="claude-3-opus-20240229",
+ temperature=0.8,
+ api_key=os.getenv("ANTHROPIC_API_KEY", "")
+)
+```
+
+## Agent Configuration
+
+```python
+# Initialize controller
+controller = CustomController()
+
+# Initialize agent
+agent = CustomAgent(
+ task="your task description here",
+ add_infos="", # Optional hints for the LLM
+ llm=llm, # LLM model configured above
+ browser=browser,
+ browser_context=browser_context,
+ controller=controller,
+ system_prompt_class=CustomSystemPrompt,
+ use_vision=False, # Must be False for DeepSeek
+ tool_call_in_content=True, # Required for DeepSeek
+ max_actions_per_step=1 # Control action granularity
+)
+```
+
+## Running Tasks
+
+```python
+# Run the agent with a maximum number of steps
+history = await agent.run(max_steps=10)
+
+# Access results
+print("Final Result:", history.final_result())
+print("Errors:", history.errors())
+print("Model Actions:", history.model_actions())
+print("Thoughts:", history.model_thoughts())
+```
+
+## Common Tasks
+
+### Navigation
+
+```python
+task="go to google.com"
+```
+
+### Search
+
+```python
+task="go to google.com and search for 'OpenAI'"
+```
+
+### Form Filling
+
+```python
+task="go to example.com/login and fill in username 'user' and password 'pass'"
+```
+
+### Clicking Elements
+
+```python
+task="click the 'Submit' button"
+```
+
+## Model-Specific Considerations
+
+1. **DeepSeek**
+ - Set `use_vision=False`
+ - Set `tool_call_in_content=True`
+ - Uses OpenAI-compatible API format
+
+2. **Gemini**
+ - Set `use_vision=True`
+ - Works well with visual tasks
+
+3. **GPT-4 & Claude-3**
+ - Support both vision and non-vision tasks
+ - Higher reasoning capabilities for complex tasks
+
+## Best Practices
+
+1. **Error Handling**
+ - Always check `history.errors()` for any issues
+ - Monitor `history.model_thoughts()` for debugging
+
+2. **Resource Management**
+ - Use async context managers for browser and context
+ - Close resources properly after use
+
+3. **Task Description**
+ - Be specific and clear in task descriptions
+ - Include necessary context in `add_infos`
+
+4. **Performance**
+ - Use `headless=True` for automated tasks
+ - Adjust `max_steps` and `max_actions_per_step` based on task complexity
+
+## Example Implementation
+
+```python
+async def main():
+ # Browser setup
+ browser = Browser(config=BrowserConfig(...))
+
+ async with await browser.new_context(...) as browser_context:
+ # Controller setup
+ controller = CustomController()
+
+ # Agent setup
+ agent = CustomAgent(
+ task="your task",
+ llm=your_configured_llm,
+ browser=browser,
+ browser_context=browser_context,
+ controller=controller,
+ system_prompt_class=CustomSystemPrompt,
+ use_vision=False,
+ tool_call_in_content=True,
+ max_actions_per_step=1
+ )
+
+ # Run task
+ history = await agent.run(max_steps=10)
+
+        # Process results
+        print(history.final_result())
+
+    # Close the browser once the context manager has exited
+    await browser.close()
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Troubleshooting
+
+1. **JSON Schema Errors with DeepSeek**
+ - Ensure using latest DeepSeek V2.5 endpoint
+ - Verify correct base URL and API key
+ - Use `tool_call_in_content=True`
+
+2. **Browser Connection Issues**
+ - Check browser configuration
+ - Verify Chrome/Chromium installation
+ - Ensure proper port access
+
+3. **Model Response Issues**
+ - Adjust temperature for more/less deterministic behavior
+ - Try different models for complex tasks
+ - Check API key validity and quotas
+
+## Tracing and Debugging
+
+### Enabling Tracing
+
+```python
+# Enable tracing in browser context
+browser_context = await browser.new_context(
+ config=BrowserContextConfig(
+ trace_path="./tmp/traces/trace.zip", # Must have .zip extension
+ no_viewport=False,
+ browser_window_size=BrowserContextWindowSize(
+ width=window_w, height=window_h
+ ),
+ )
+)
+```
+
+### Using Traces for Debugging
+
+1. **Recording Traces**
+ - Traces are automatically saved when `trace_path` is provided
+ - Files are saved with `.zip` extension
+ - Contains browser actions, network requests, and screenshots
+
+2. **Analyzing Traces**
+   - Use Playwright Trace Viewer to analyze traces (a programmatic alternative is sketched after this list)
+ - View step-by-step browser actions
+ - Inspect network requests and responses
+ - Review page states at each step
+
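+For programmatic analysis, the repository's trace analyzer can be used as well. A minimal sketch, assuming `analyze_trace` accepts the path to a saved trace archive (as it is used in `cli/browser_use_cli.py`):
+
+```python
+import asyncio
+import json
+
+from src.trace_analyzer import analyze_trace
+
+async def main():
+    # Illustrative path; point this at a trace saved via trace_path above
+    analysis = await analyze_trace("./tmp/traces/trace.zip")
+    print(json.dumps(analysis, indent=2))
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+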
+## Report Generation
+
+### Best Practices
+
+1. **Structure**
+ - Always include page title and headings
+ - List interactive elements with their types
+ - Provide clear hierarchy of content
+ - Include relevant metadata (URLs, timestamps)
+
+2. **Content**
+ - Focus on task-relevant information
+ - Include both static and dynamic content
+ - Document interactive elements and their states
+ - Note any errors or warnings
+
+3. **Format**
+ - Use clear section headings
+ - Include numbered or bulleted lists
+ - Add summary sections for complex pages
+ - Use markdown formatting for readability
+
+### Example Report Task
+
+```python
+task = "create a report about the page structure, including any interactive elements found"
+add_infos = "Focus on navigation elements and forms"
+
+agent = CustomAgent(
+ task=task,
+ add_infos=add_infos,
+ llm=llm,
+ browser=browser,
+ browser_context=browser_context,
+ controller=controller,
+ system_prompt_class=CustomSystemPrompt,
+ use_vision=True, # Enable vision for better structure analysis
+ max_actions_per_step=1
+)
+```
diff --git a/demo_logging.py b/demo_logging.py
new file mode 100644
index 00000000..f7c70093
--- /dev/null
+++ b/demo_logging.py
@@ -0,0 +1,99 @@
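+"""Demonstration of the TaskLogger API: phases, step updates, retry handling, and log history output."""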
+import asyncio
+from src.utils.task_logging import (
+ TaskLogger, TaskStatus, ActionType, RetryConfig,
+ ColorScheme, SeparatorStyle
+)
+
+async def demo_logging():
+ # Initialize logger with custom styles
+ logger = TaskLogger(
+ "demo_task",
+ "Demonstrate all logging features",
+ color_scheme=ColorScheme(),
+ separator_style=SeparatorStyle(
+ task="★" * 40,
+ phase="•" * 30,
+ error="!" * 35
+ )
+ )
+
+ # Start navigation phase
+ logger.start_phase("Navigation Phase")
+ logger.update_step(
+ "Navigate to example.com",
+ TaskStatus.RUNNING,
+ action_type=ActionType.NAVIGATION,
+ context={"url": "https://example.com"}
+ )
+
+ # Update browser state
+ logger.update_browser_state(
+ url="https://example.com",
+ page_ready=True,
+ dynamic_content_loaded=True,
+ visible_elements=15,
+ page_title="Example Domain"
+ )
+
+ # Complete navigation
+ logger.update_step(
+ "Page loaded successfully",
+ TaskStatus.COMPLETE,
+ action_type=ActionType.NAVIGATION,
+ progress=0.25,
+ results={"status": 200, "load_time": 0.5}
+ )
+
+ # Start interaction phase
+ logger.start_phase("Interaction Phase")
+ logger.update_step(
+ "Click search button",
+ TaskStatus.RUNNING,
+ action_type=ActionType.INTERACTION,
+ context={"element": "search_button"}
+ )
+
+ # Simulate error and retry
+ async def failing_operation():
+ raise ValueError("Search button not found")
+
+ try:
+ await logger.execute_with_retry(
+ failing_operation,
+ "click_search",
+ RetryConfig(max_retries=2, base_delay=0.1)
+ )
+ except ValueError:
+ pass
+
+ # Start extraction phase
+ logger.start_phase("Data Extraction Phase")
+ logger.update_step(
+ "Extract search results",
+ TaskStatus.RUNNING,
+ action_type=ActionType.EXTRACTION,
+ progress=0.75
+ )
+
+ # Complete extraction
+ logger.update_step(
+ "Data extracted successfully",
+ TaskStatus.COMPLETE,
+ action_type=ActionType.EXTRACTION,
+ progress=1.0,
+ results={"items_found": 10}
+ )
+
+ # Display log history
+ print("\nLog History:")
+ print("=" * 80)
+ for entry in logger.get_log_history():
+ print(entry)
+ print("=" * 80)
+
+ # Log final state
+ print("\nFinal State:")
+ logger.log_state()
+
+if __name__ == "__main__":
+ asyncio.run(demo_logging())
\ No newline at end of file
diff --git a/docs/enhanced_tracing.md b/docs/enhanced_tracing.md
new file mode 100644
index 00000000..b69aea07
--- /dev/null
+++ b/docs/enhanced_tracing.md
@@ -0,0 +1,297 @@
+# Enhanced Tracing Documentation
+
+## Overview
+
+The enhanced tracing system provides detailed insights into browser automation actions, decision-making processes, and error recovery strategies. This documentation covers all major components and their usage.
+
+## Components
+
+### 1. Action Context
+Captures detailed information about element states and interactions.
+
+```json
+{
+ "action_context": {
+ "element_state_before": {
+ "visible": true,
+ "computed_styles": {
+ "pointer-events": "auto",
+ "opacity": "1",
+ "z-index": "100"
+ },
+ "focus_state": "not-focused",
+ "accessibility": {
+ "aria-hidden": "false",
+ "aria-disabled": "false"
+ }
+ },
+ "element_state_after": {
+ "visible": true,
+ "focus_state": "focused",
+ "triggered_events": ["click", "focus"]
+ }
+ }
+}
+```
+
+**Key Features:**
+- Before/after state tracking
+- Computed style analysis
+- Focus and accessibility state monitoring
+- Event triggering information
+
+### 2. Decision Trail
+Records the AI model's decision-making process and confidence levels.
+
+```json
+{
+ "decision_trail": {
+ "confidence_threshold": 0.8,
+ "attention_weights": {
+ "element_text": 0.6,
+ "aria_label": 0.3,
+ "position": 0.1
+ },
+ "alternative_paths": [
+ {
+ "action": "click hamburger menu",
+ "rejected_reason": "settings directly visible",
+ "confidence": 0.4
+ }
+ ]
+ }
+}
+```
+
+**Key Features:**
+- Confidence thresholds
+- Attention weight distribution
+- Alternative action consideration
+- Rejection reasoning
+
+### 3. Element Identification
+Provides comprehensive element location and relationship information.
+
+```json
+{
+ "element_identification": {
+ "relative_position": {
+ "from_top_nav": "20px from right",
+ "from_viewport": "top-right quadrant"
+ },
+ "hierarchy": {
+ "parent": "nav.top-bar",
+ "siblings": ["button.new-template", "button.help"],
+ "children": ["span.icon", "span.text"]
+ }
+ }
+}
+```
+
+**Key Features:**
+- Relative positioning
+- Element hierarchy
+- Sibling relationships
+- Visual landmarks
+
+### 4. Visual State Tracking
+Monitors visual changes and layout shifts during automation.
+
+```json
+{
+ "visual_state": {
+ "screenshot_diffs": {
+ "before_click": "diff_1.png",
+ "after_click": "diff_2.png",
+ "changes_highlighted": true
+ },
+ "layout_shifts": [
+ {
+ "timestamp": "T+100ms",
+ "elements_moved": ["#settings-panel"],
+ "cumulative_layout_shift": 0.1
+ }
+ ]
+ }
+}
+```
+
+**Key Features:**
+- Screenshot diffing
+- Layout shift tracking
+- Element visibility analysis
+- Viewport position monitoring
+
+### 5. Error Recovery
+Provides sophisticated error handling and recovery strategies.
+
+```json
+{
+ "error_recovery": {
+ "retry_strategy": {
+ "backoff": "exponential",
+ "max_attempts": 3,
+ "conditions": {
+ "network_stable": true,
+ "animations_complete": true
+ }
+ },
+ "environment_factors": {
+ "network_conditions": {
+ "latency": "50ms",
+ "bandwidth": "10Mbps"
+ }
+ }
+ }
+}
+```
+
+**Key Features:**
+- Retry strategies
+- Environmental monitoring
+- Recovery checkpoints
+- State restoration
+
+### 6. Performance Monitoring
+Tracks timing and performance metrics.
+
+```json
+{
+ "timing_analysis": {
+ "action_breakdown": {
+ "element_search": "150ms",
+ "interaction_delay": "50ms",
+ "animation_duration": "200ms"
+ },
+ "performance_markers": {
+ "first_paint": "100ms",
+ "first_contentful_paint": "200ms"
+ }
+ }
+}
+```
+
+**Key Features:**
+- Action timing breakdown
+- Performance markers
+- Cumulative timing
+- Resource utilization
+
+## Usage
+
+### Basic Usage
+```python
+from src.trace_analyzer import EnhancedTraceAnalyzer
+
+analyzer = EnhancedTraceAnalyzer(trace_file_path)
+result = await analyzer.analyze_all()
+```
+
+### Component-Specific Analysis
+```python
+# Analyze specific components
+timing = await analyzer.analyze_timing()
+visual = await analyzer.analyze_visual_state()
+recovery = await analyzer.analyze_error_recovery()
+```
+
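+As an illustrative sketch (assuming `analyze_timing()` returns the `timing_analysis` structure shown earlier), the per-action breakdown can be inspected directly:
+
+```python
+# Sketch only: keys follow the timing_analysis example earlier in this document
+timing = await analyzer.analyze_timing()
+breakdown = timing.get("timing_analysis", {}).get("action_breakdown", {})
+for action, duration in breakdown.items():
+    print(f"{action}: {duration}")
+```
+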
+### Error Recovery Integration
+```python
+recovery_info = await analyzer.analyze_recovery_info()
+if recovery_info["retry_strategy"]["backoff"] == "exponential":
+ # Implement exponential backoff retry logic
+```
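+
+This pairs naturally with the `ErrorHandler` added in `src/utils/error_handling.py`. A minimal sketch, with `perform_action` standing in for whatever browser step is being retried:
+
+```python
+from src.utils.error_handling import ErrorHandler
+
+handler = ErrorHandler()
+while True:
+    try:
+        await perform_action()  # placeholder for the flaky browser step
+        break
+    except Exception as error:
+        # Sleeps with exponential backoff; raises MaxRetriesExceededError once the retry budget (3) is spent
+        await handler.handle_error(error, operation="click_settings")
+```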
+
+## Best Practices
+
+1. **Performance Optimization**
+ - Monitor cumulative timing metrics
+ - Track resource utilization
+ - Optimize retry strategies
+
+2. **Error Recovery**
+ - Use exponential backoff for retries
+ - Monitor environmental factors
+ - Maintain state checkpoints
+
+3. **Visual Verification**
+ - Use screenshot diffs for validation
+ - Monitor layout shifts
+ - Track element visibility
+
+4. **Decision Making**
+ - Review confidence thresholds
+ - Analyze attention weights
+ - Consider alternative paths
+
+## Common Issues and Solutions
+
+### 1. Element Not Found
+```json
+{
+ "error_recovery": {
+ "retry_strategy": {
+ "backoff": "exponential",
+ "conditions": {
+ "animations_complete": true
+ }
+ }
+ }
+}
+```
+**Solution:** Wait for animations to complete and retry with exponential backoff.
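+
+A minimal sketch of this pattern using the `RetryConfig` helper added in `src/utils/task_logging.py` (`find_element` and `animations_complete` are hypothetical placeholders for the real checks):
+
+```python
+import asyncio
+from src.utils.task_logging import RetryConfig
+
+retry = RetryConfig(max_retries=3)
+for attempt in range(retry.max_retries + 1):
+    await asyncio.sleep(retry.get_delay(attempt))  # 0s first, then exponential backoff with jitter
+    # find_element / animations_complete are placeholders for the real checks
+    if await animations_complete() and await find_element("#settings"):
+        break
+```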
+
+### 2. Layout Shifts
+```json
+{
+ "visual_state": {
+ "layout_shifts": [
+ {
+ "cumulative_layout_shift": 0.1
+ }
+ ]
+ }
+}
+```
+**Solution:** Monitor CLS and wait for layout stability before interactions.
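+
+A minimal sketch using `analyze_visual_state()` from this analyzer, flagging steps that exceed a chosen CLS budget of 0.1:
+
+```python
+visual = await analyzer.analyze_visual_state()
+if visual["cumulative_layout_shift"] > 0.1:
+    unstable = [step["step_id"] for step in visual["visual_changes"] if step["layout_shifts"]]
+    print(f"Layout unstable during steps: {unstable}")
+```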
+
+### 3. Network Issues
+```json
+{
+ "environment_factors": {
+ "network_conditions": {
+ "stability": "unstable"
+ }
+ }
+}
+```
+**Solution:** Implement network condition checks in retry strategy.
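+
+A minimal sketch using `analyze_error_recovery()` from this analyzer (assuming the trace records `environment_factors` as in the error-recovery example above), surfacing failures seen under unstable network conditions so the retry strategy can extend its backoff:
+
+```python
+recovery = await analyzer.analyze_error_recovery()
+for step in recovery["error_steps"]:
+    network = step["environment_factors"].get("network_conditions", {})
+    if network.get("stability") == "unstable":
+        print(f"Step {step['step_id']} failed on an unstable network; extend the backoff before retrying")
+```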
+
+## API Reference
+
+### EnhancedTraceAnalyzer Methods
+
+#### analyze_action_context()
+Returns detailed information about element states and interactions.
+
+#### analyze_decision_trail()
+Returns the AI model's decision-making process and confidence levels.
+
+#### analyze_element_identification()
+Returns comprehensive element location and relationship information.
+
+#### analyze_visual_state()
+Returns visual changes and layout shift information.
+
+#### analyze_error_recovery()
+Returns error handling and recovery strategies.
+
+#### analyze_timing()
+Returns detailed timing and performance metrics.
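+
+All of the above can be gathered in one pass with `analyze_all()`. A minimal usage sketch (the trace path is illustrative):
+
+```python
+analyzer = EnhancedTraceAnalyzer("traces/trace.zip")
+report = await analyzer.analyze_all()
+print("Failed steps:", report["failure_analysis"]["failed_steps_count"])
+print("Total step time:", report["timing_analysis"]["summary"]["total_duration"])
+```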
+
+## Contributing
+
+When adding new tracing features:
+
+1. Follow the existing data structure pattern
+2. Add comprehensive test coverage
+3. Update documentation with examples
+4. Include error handling cases
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..c260ce12
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,23 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "browser-use"
+version = "0.1.19"
+authors = [
+ { name = "Your Name", email = "your.email@example.com" }
+]
+description = "A Python package for browser automation with AI"
+readme = "README.md"
+requires-python = ">=3.11"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["src*"]
+namespaces = false
\ No newline at end of file
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..88001be3
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,19 @@
+[pytest]
+asyncio_mode = auto
+asyncio_default_fixture_loop_scope = function
+
+# Test discovery
+testpaths = tests
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+
+# Output configuration
+console_output_style = count
+log_cli = True
+log_cli_level = INFO
+
+# Warnings
+filterwarnings =
+ ignore::DeprecationWarning
+ ignore::pytest.PytestDeprecationWarning
\ No newline at end of file
diff --git a/pytest_output.txt b/pytest_output.txt
new file mode 100644
index 00000000..fe9b67ce
--- /dev/null
+++ b/pytest_output.txt
@@ -0,0 +1,64 @@
+============================= test session starts ==============================
+platform darwin -- Python 3.11.9, pytest-8.3.4, pluggy-1.5.0 -- /Users/dmieloch/Dev/experiments/web-ui/venv/bin/python
+cachedir: .pytest_cache
+rootdir: /Users/dmieloch/Dev/experiments/web-ui
+configfile: pytest.ini
+plugins: cov-6.0.0, asyncio-0.25.2, anyio-4.8.0, timeout-2.3.1
+asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=function
+collecting ...
+----------------------------- live log collection ------------------------------
+INFO root:service.py:51 Anonymized telemetry enabled. See https://github.com/gregpr07/browser-use for more information.
+INFO httpx:_client.py:1038 HTTP Request: GET https://api.gradio.app/gradio-messaging/en "HTTP/1.1 200 OK"
+collected 28 items
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_basic_initialization
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+PASSED [ 1/28]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_window_size
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+-------------------------------- live log call ---------------------------------
+INFO src.agent.custom_agent:custom_agent.py:356 🚀 Starting task: go to data:text/html,
+INFO src.agent.custom_agent:custom_agent.py:196
+📍 Step 1
+INFO httpx:_client.py:1786 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 422 Unprocessable Entity"
+INFO httpx:_client.py:1038 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
+INFO src.agent.custom_agent:custom_agent.py:128 🤷 Eval: Unknown - No previous actions to evaluate.
+INFO src.agent.custom_agent:custom_agent.py:129 🧠 New Memory:
+INFO src.agent.custom_agent:custom_agent.py:130 ⏳ Task Progress:
+INFO src.agent.custom_agent:custom_agent.py:131 🤔 Thought: The task requires navigating to a specific URL to display the window size. The current page is 'about:blank', and no actions have been taken yet.
+INFO src.agent.custom_agent:custom_agent.py:132 🎯 Summary: Navigate to the specified URL to display the window size.
+INFO src.agent.custom_agent:custom_agent.py:134 🛠️ Action 1/1: {"go_to_url":{"url":"data:text/html,"}}
+INFO src.agent.custom_agent:custom_agent.py:207 🧠 All Memory:
+INFO browser_use.controller.service:service.py:59 🔗 Navigated to data:text/html,
+INFO src.agent.custom_agent:custom_agent.py:196
+📍 Step 2
+INFO httpx:_client.py:1786 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 422 Unprocessable Entity"
+INFO httpx:_client.py:1038 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
+INFO src.agent.custom_agent:custom_agent.py:128 ✅ Eval: Success - Successfully navigated to the specified URL and displayed the window size.
+INFO src.agent.custom_agent:custom_agent.py:129 🧠 New Memory: Window size: 800x600
+INFO src.agent.custom_agent:custom_agent.py:130 ⏳ Task Progress: 1. Navigated to the specified URL to display the window size.
+INFO src.agent.custom_agent:custom_agent.py:131 🤔 Thought: The task has been completed as the window size is now displayed on the page. No further actions are required.
+INFO src.agent.custom_agent:custom_agent.py:132 🎯 Summary: The task is complete. The window size is displayed as 800x600.
+INFO src.agent.custom_agent:custom_agent.py:134 🛠️ Action 1/1: {"done":{"text":"The task is complete. The window size is displayed as 800x600."}}
+INFO src.agent.custom_agent:custom_agent.py:207 🧠 All Memory: Window size: 800x600
+
+INFO src.agent.custom_agent:custom_agent.py:218 📄 Result: The task is complete. The window size is displayed as 800x600.
+INFO src.agent.custom_agent:custom_agent.py:399 ✅ Task completed successfully
+WARNING src.agent.custom_agent:custom_agent.py:260 No history or first screenshot to create GIF from
+PASSED [ 2/28]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_headless_mode
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
diff --git a/src/__init__.py b/src/__init__.py
index 93fbe7f8..0edfbf30 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -3,4 +3,23 @@
# @Author : wenshao
# @Email : wenshaoguo1026@gmail.com
# @Project : browser-use-webui
-# @FileName: __init__.py.py
+# @FileName: __init__.py
+
+from browser_use.browser.browser import Browser
+from browser_use.browser.browser import BrowserConfig
+from browser_use.browser.context import BrowserContextConfig, BrowserContextWindowSize
+from .agent.custom_agent import CustomAgent
+from .controller.custom_controller import CustomController
+from .agent.custom_prompts import CustomSystemPrompt
+from .utils import utils
+
+__all__ = [
+ 'Browser',
+ 'BrowserConfig',
+ 'BrowserContextConfig',
+ 'BrowserContextWindowSize',
+ 'CustomAgent',
+ 'CustomController',
+ 'CustomSystemPrompt',
+ 'utils'
+]
diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py
index ff8908c8..0332067d 100644
--- a/src/agent/custom_agent.py
+++ b/src/agent/custom_agent.py
@@ -8,11 +8,12 @@
import logging
import pdb
import traceback
-from typing import Optional, Type
+from typing import Optional, Type, Any, Dict
from PIL import Image, ImageDraw, ImageFont
import os
import base64
import io
+import datetime
+import json
from browser_use.agent.prompts import SystemPrompt
from browser_use.agent.service import Agent
@@ -37,11 +38,13 @@
BaseMessage,
)
from src.utils.agent_state import AgentState
+from src.utils.logging import BatchedEventLogger
from .custom_massage_manager import CustomMassageManager
from .custom_views import CustomAgentOutput, CustomAgentStepInfo
logger = logging.getLogger(__name__)
+batched_logger = BatchedEventLogger(logger)
class CustomAgent(Agent):
@@ -117,23 +120,41 @@ def _setup_action_models(self) -> None:
self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)
def _log_response(self, response: CustomAgentOutput) -> None:
- """Log the model's response"""
- if "Success" in response.current_state.prev_action_evaluation:
- emoji = "✅"
- elif "Failed" in response.current_state.prev_action_evaluation:
- emoji = "❌"
- else:
- emoji = "🤷"
-
- logger.info(f"{emoji} Eval: {response.current_state.prev_action_evaluation}")
- logger.info(f"🧠 New Memory: {response.current_state.important_contents}")
- logger.info(f"⏳ Task Progress: {response.current_state.completed_contents}")
- logger.info(f"🤔 Thought: {response.current_state.thought}")
- logger.info(f"🎯 Summary: {response.current_state.summary}")
+ """Log the model's response in a structured format"""
+ evaluation_status = "success" if "Success" in response.current_state.prev_action_evaluation else "failed"
+
+ log_data = {
+ "timestamp": datetime.datetime.now().isoformat(),
+ "action": "model_response",
+ "status": evaluation_status,
+ "state": {
+ "evaluation": response.current_state.prev_action_evaluation,
+ "memory": response.current_state.important_contents,
+ "progress": response.current_state.completed_contents,
+ "thought": response.current_state.thought,
+ "summary": response.current_state.summary
+ }
+ }
+
+ logger.info(
+ f"Model Response: {evaluation_status}",
+ extra={
+ "event_type": "model_response",
+ "event_data": log_data
+ }
+ )
+
+ # Batch action logging
for i, action in enumerate(response.action):
- logger.info(
- f"🛠️ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}"
+ batched_logger.add_event(
+ "action",
+ {
+ "action_number": i + 1,
+ "total_actions": len(response.action),
+ "action_data": json.loads(action.model_dump_json(exclude_unset=True))
+ }
)
+ batched_logger.flush()
def update_step_info(
self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
@@ -193,7 +214,19 @@ async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutpu
@time_execution_async("--step")
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
"""Execute one step of the task"""
- logger.info(f"\n📍 Step {self.n_steps}")
+ step_data = {
+ "step_number": self.n_steps,
+ "timestamp": datetime.datetime.now().isoformat()
+ }
+
+ logger.info(
+ f"Starting step {self.n_steps}",
+ extra={
+ "event_type": "step_start",
+ "event_data": step_data
+ }
+ )
+
state = None
model_output = None
result: list[ActionResult] = []
@@ -204,9 +237,18 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
input_messages = self.message_manager.get_messages()
model_output = await self.get_next_action(input_messages)
self.update_step_info(model_output, step_info)
- logger.info(f"🧠 All Memory: {step_info.memory}")
+
+ if step_info:
+ logger.debug(
+ "Step memory updated",
+ extra={
+ "event_type": "memory_update",
+ "event_data": {"memory": step_info.memory}
+ }
+ )
+
self._save_conversation(input_messages, model_output)
- self.message_manager._remove_last_state_message() # we dont want the whole state in the chat history
+ self.message_manager._remove_last_state_message()
self.message_manager.add_model_output(model_output)
result: list[ActionResult] = await self.controller.multi_act(
@@ -215,17 +257,37 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
self._last_result = result
if len(result) > 0 and result[-1].is_done:
- logger.info(f"📄 Result: {result[-1].extracted_content}")
+ logger.info(
+ "Task completed",
+ extra={
+ "event_type": "task_complete",
+ "event_data": {
+ "result": result[-1].extracted_content
+ }
+ }
+ )
self.consecutive_failures = 0
except Exception as e:
result = self._handle_step_error(e)
self._last_result = result
+ logger.error(
+ f"Step error: {str(e)}",
+ extra={
+ "event_type": "step_error",
+ "event_data": {
+ "error": str(e),
+ "traceback": traceback.format_exc()
+ }
+ },
+ exc_info=True
+ )
finally:
if not result:
return
+
for r in result:
if r.error:
self.telemetry.capture(
@@ -234,8 +296,28 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
error=r.error,
)
)
+ logger.error(
+ f"Action error: {r.error}",
+ extra={
+ "event_type": "action_error",
+ "event_data": {
+ "error": r.error
+ }
+ }
+ )
+
if state:
self._make_history_item(model_output, state, result)
+
+ step_data["status"] = "completed"
+ logger.info(
+ f"Step {self.n_steps} completed",
+ extra={
+ "event_type": "step_complete",
+ "event_data": step_data
+ }
+ )
+
def create_history_gif(
self,
output_path: str = 'agent_history.gif',
diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py
index 56aeb64b..b64b3b9f 100644
--- a/src/agent/custom_prompts.py
+++ b/src/agent/custom_prompts.py
@@ -66,9 +66,10 @@ def important_rules(self) -> str:
- Use scroll to find elements you are looking for
5. TASK COMPLETION:
- - If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the done action to terminate the operation process.
+ - If you think all the requirements of user's instruction have been completed and no further operation is required, output the done action to terminate the operation process.
- Don't hallucinate actions.
- If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
+ - When generating reports about page structure, always include the page title and headings.
- If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
6. VISUAL CONTEXT:
@@ -163,13 +164,13 @@ def __init__(
def get_user_message(self) -> HumanMessage:
state_description = f"""
- 1. Task: {self.step_info.task}
+ 1. Task: {self.step_info.task if self.step_info else ""}
2. Hints(Optional):
- {self.step_info.add_infos}
+ {self.step_info.add_infos if self.step_info else ""}
3. Memory:
- {self.step_info.memory}
+ {self.step_info.memory if self.step_info else ""}
4. Task Progress:
- {self.step_info.task_progress}
+ {self.step_info.task_progress if self.step_info else ""}
5. Current url: {self.state.url}
6. Available tabs:
{self.state.tabs}
diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py
index 6de991bf..c0aa1961 100644
--- a/src/browser/custom_context.py
+++ b/src/browser/custom_context.py
@@ -8,6 +8,7 @@
import json
import logging
import os
+from pathlib import Path
from browser_use.browser.browser import Browser
from browser_use.browser.context import BrowserContext, BrowserContextConfig
@@ -25,6 +26,7 @@ def __init__(
config: BrowserContextConfig = BrowserContextConfig()
):
super(CustomBrowserContext, self).__init__(browser=browser, config=config)
+ self._context = None
async def _create_context(self, browser: PlaywrightBrowser) -> PlaywrightBrowserContext:
"""Creates a new browser context with anti-detection measures and loads cookies if available."""
@@ -93,4 +95,20 @@ async def _create_context(self, browser: PlaywrightBrowser) -> PlaywrightBrowser
"""
)
+ self._context = context
return context
+
+ @property
+ def context(self) -> PlaywrightBrowserContext | None:
+ """Get the underlying Playwright browser context."""
+ return self._context
+
+ async def close(self):
+ """Close the browser context and stop tracing if enabled."""
+ if self.config.trace_path and self._context:
+ trace_path = Path(self.config.trace_path)
+ trace_path.parent.mkdir(parents=True, exist_ok=True)
+ if not trace_path.suffix:
+ trace_path = trace_path / "trace.zip"
+ await self._context.tracing.stop(path=str(trace_path))
+ await super().close()
diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py
index 6e57dd4a..21a56b5a 100644
--- a/src/controller/custom_controller.py
+++ b/src/controller/custom_controller.py
@@ -8,6 +8,7 @@
from browser_use.agent.views import ActionResult
from browser_use.browser.context import BrowserContext
from browser_use.controller.service import Controller
+from browser_use.browser.views import BrowserState
class CustomController(Controller):
@@ -31,3 +32,8 @@ async def paste_from_clipboard(browser: BrowserContext):
await page.keyboard.type(text)
return ActionResult(extracted_content=text)
+
+ async def get_browser_state(self, browser_context: BrowserContext) -> BrowserState:
+ """Get the current state of the browser"""
+ state = await browser_context.get_state(use_vision=True)
+ return state
diff --git a/src/trace_analyzer.py b/src/trace_analyzer.py
new file mode 100644
index 00000000..0590dd5e
--- /dev/null
+++ b/src/trace_analyzer.py
@@ -0,0 +1,644 @@
+import json
+import zipfile
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+import asyncio
+
+class PlaywrightTrace:
+ def __init__(self, trace_path: str):
+ self.trace_path = Path(trace_path)
+ self.actions: List[Dict[str, Any]] = []
+ self.network_requests: List[Dict[str, Any]] = []
+ self.console_logs: List[str] = []
+ self.errors: List[str] = []
+
+ @classmethod
+ async def parse(cls, trace_path: str) -> 'PlaywrightTrace':
+ """Parse a Playwright trace file and return a PlaywrightTrace instance."""
+ trace = cls(trace_path)
+ await trace._parse_trace_file()
+ return trace
+
+ async def _parse_trace_file(self):
+ """Parse the trace.zip file and extract relevant information."""
+ if not self.trace_path.exists():
+ raise FileNotFoundError(f"Trace file not found: {self.trace_path}")
+
+ try:
+ with zipfile.ZipFile(self.trace_path, 'r') as zip_ref:
+ # List all files in the zip
+ files = zip_ref.namelist()
+
+ # Parse trace files
+ for file in files:
+ if file.endswith('.trace'):
+ trace_data = zip_ref.read(file).decode('utf-8')
+ for line in trace_data.split('\n'):
+ if line.strip():
+ try:
+ event = json.loads(line)
+ self._process_event(event)
+ except json.JSONDecodeError:
+ self.errors.append(f"Failed to parse trace event: {line}")
+
+ # Parse network HAR if available
+ har_files = [f for f in files if f.endswith('.har')]
+ if har_files:
+ har_data = json.loads(zip_ref.read(har_files[0]).decode('utf-8'))
+ self._process_har(har_data)
+
+ except zipfile.BadZipFile:
+ raise ValueError(f"Invalid trace file format: {self.trace_path}")
+
+ def _process_event(self, event: Dict[str, Any]):
+ """Process a single trace event and categorize it."""
+ if 'type' not in event:
+ return
+
+ event_type = event['type']
+
+ if event_type == 'before' or event_type == 'after':
+ # Handle action events
+ if 'method' in event and 'params' in event:
+ self.actions.append({
+ 'type': event['method'],
+ 'timestamp': event.get('timestamp', 0),
+ 'duration': event.get('duration', 0),
+ 'params': event['params'],
+ 'success': event_type == 'after' and 'error' not in event,
+ 'error': event.get('error')
+ })
+ elif event_type == 'console':
+ # Handle console messages
+ if 'text' in event:
+ self.console_logs.append(event['text'])
+ elif event_type == 'error':
+ # Handle error events
+ if 'error' in event:
+ self.errors.append(event['error'].get('message', str(event['error'])))
+
+ def _process_har(self, har_data: Dict[str, Any]):
+ """Process HAR data to extract network requests."""
+ if 'log' in har_data and 'entries' in har_data['log']:
+ for entry in har_data['log']['entries']:
+ request = entry.get('request', {})
+ response = entry.get('response', {})
+
+ self.network_requests.append({
+ 'url': request.get('url'),
+ 'method': request.get('method'),
+ 'status': response.get('status'),
+ 'statusText': response.get('statusText'),
+ 'duration': entry.get('time'), # in milliseconds
+ 'failure': response.get('status', 0) >= 400
+ })
+
+async def analyze_trace(trace_path: str) -> dict:
+ """Parse a Playwright trace file and return structured data."""
+ trace = await PlaywrightTrace.parse(trace_path)
+ return {
+ "actions": trace.actions,
+ "network_requests": trace.network_requests,
+ "console_logs": trace.console_logs,
+ "errors": trace.errors,
+ "summary": {
+ "total_actions": len(trace.actions),
+ "failed_actions": sum(1 for a in trace.actions if not a['success']),
+ "total_requests": len(trace.network_requests),
+ "failed_requests": sum(1 for r in trace.network_requests if r.get('failure')),
+ "total_errors": len(trace.errors),
+ "error_summary": "\n".join(trace.errors) if trace.errors else "No errors"
+ }
+ }
+
+if __name__ == "__main__":
+ # Example usage
+ async def main():
+ result = await analyze_trace("path/to/trace.zip")
+ print(json.dumps(result, indent=2))
+
+ asyncio.run(main())
+
+class EnhancedTraceAnalyzer:
+ """Enhanced trace analyzer for detailed browser automation insights.
+
+ This class provides comprehensive analysis of browser automation traces, including:
+ - Action context and element states
+ - Decision-making processes and confidence levels
+ - Element identification and relationships
+ - Visual state changes and layout shifts
+ - Error recovery strategies
+ - Performance metrics and timing analysis
+
+ Example:
+ ```python
+ analyzer = EnhancedTraceAnalyzer("trace.zip")
+ result = await analyzer.analyze_all()
+
+ # Component-specific analysis
+ timing = await analyzer.analyze_timing()
+ visual = await analyzer.analyze_visual_state()
+ ```
+ """
+
+ def __init__(self, trace_file_path: str):
+ """Initialize the enhanced trace analyzer.
+
+ Args:
+ trace_file_path: Path to the trace file (ZIP format) containing enhanced trace data.
+ """
+ self.trace_file_path = trace_file_path
+ self._trace_data: Optional[Dict[str, Any]] = None
+
+ async def _load_trace_data(self) -> Dict[str, Any]:
+ """Load and validate enhanced trace data from the trace file.
+
+ Returns:
+ Dict containing the parsed trace data.
+
+ Raises:
+ ValueError: If the trace file is invalid or cannot be parsed.
+ """
+ if self._trace_data is None:
+ try:
+ trace_path = Path(self.trace_file_path)
+
+ # Handle nested directory structure
+ if trace_path.is_dir():
+ trace_zip = trace_path / 'trace.zip'
+ if trace_zip.is_dir():
+ trace_files = list(trace_zip.glob('*.zip'))
+ if not trace_files:
+ raise ValueError("No trace files found")
+ trace_path = trace_files[0]
+ else:
+ raise ValueError("Invalid trace directory structure")
+
+ # Parse Playwright trace
+ with zipfile.ZipFile(trace_path) as zf:
+ # Load trace data
+ with zf.open('trace.trace') as f:
+ trace_events = []
+ for line in f.read().decode('utf-8').splitlines():
+ if line.strip():
+ trace_events.append(json.loads(line))
+
+ # Load network data
+ with zf.open('trace.network') as f:
+ network_events = []
+ for line in f.read().decode('utf-8').splitlines():
+ if line.strip():
+ network_events.append(json.loads(line))
+
+ # Convert to enhanced trace format
+ self._trace_data = self._convert_playwright_trace(trace_events, network_events)
+
+ except Exception as e:
+ raise ValueError(f"Failed to load trace data: {str(e)}")
+
+ return self._trace_data
+
+ def _convert_playwright_trace(self, trace_events: List[Dict[str, Any]], network_events: List[Dict[str, Any]]) -> Dict[str, Any]:
+ """Convert Playwright trace format to enhanced trace format."""
+ # Extract metadata
+ metadata = {
+ "session_id": trace_events[0].get('sessionId', 'unknown'),
+ "timestamp": trace_events[0].get('timestamp', 0),
+ "browser_info": {
+ "viewport": next(
+ (e.get('params', {}).get('viewport') for e in trace_events
+ if e.get('method') == 'setViewportSize'),
+ {"width": 0, "height": 0}
+ ),
+ "user_agent": next(
+ (e.get('params', {}).get('userAgent') for e in trace_events
+ if e.get('method') == 'setUserAgent'),
+ "unknown"
+ )
+ }
+ }
+
+ # Extract steps
+ steps = []
+ current_step = None
+
+ for event in trace_events:
+ if event.get('type') == 'before':
+ if current_step:
+ steps.append(current_step)
+ current_step = {
+ "step_id": len(steps) + 1,
+ "action": event.get('method', 'unknown'),
+ "target": event.get('params', {}).get('selector', ''),
+ "timing": {
+ "start": event.get('timestamp', 0),
+ "end": None,
+ "duration": None
+ },
+ "status": "pending",
+ "error_context": None,
+ "visual_state": {
+ "screenshot_diffs": {},
+ "element_visibility": {},
+ "layout_shifts": []
+ },
+ "action_context": {
+ "element_state": event.get('params', {}),
+ "viewport_state": metadata['browser_info']['viewport']
+ }
+ }
+ elif event.get('type') == 'after' and current_step:
+ current_step['timing']['end'] = event.get('timestamp', 0)
+ current_step['timing']['duration'] = (
+ current_step['timing']['end'] - current_step['timing']['start']
+ )
+ current_step['status'] = 'error' if 'error' in event else 'success'
+ if 'error' in event:
+ current_step['error_context'] = {
+ "error_type": event['error'].get('name', 'unknown'),
+ "message": event['error'].get('message', ''),
+ "stack": event['error'].get('stack', '')
+ }
+
+ if current_step:
+ steps.append(current_step)
+
+ # Add network information
+ network_info = {
+ "requests": [
+ {
+ "url": event.get('params', {}).get('url'),
+ "method": event.get('params', {}).get('method'),
+ "status": event.get('params', {}).get('status'),
+ "timing": event.get('params', {}).get('timing')
+ }
+ for event in network_events
+ if event.get('method') == 'Network.responseReceived'
+ ]
+ }
+
+ return {
+ "metadata": metadata,
+ "steps": steps,
+ "network": network_info,
+ "performance": {
+ "navigation_timing": {
+ "dom_complete": next(
+ (e.get('timestamp', 0) for e in trace_events
+ if e.get('method') == 'domcontentloaded'),
+ 0
+ ),
+ "load_complete": next(
+ (e.get('timestamp', 0) for e in trace_events
+ if e.get('method') == 'load'),
+ 0
+ )
+ },
+ "interaction_timing": {
+ "time_to_first_interaction": next(
+ (e.get('timestamp', 0) for e in trace_events
+ if e.get('type') == 'before' and e.get('method') in ['click', 'fill']),
+ 0
+ ) - metadata['timestamp'],
+ "action_latency": sum(
+ step['timing']['duration'] for step in steps
+ if step['timing']['duration'] is not None
+ ) / len(steps) if steps else 0
+ }
+ }
+ }
+
+ async def analyze_action_context(self) -> Dict[str, Any]:
+ """Analyze the context of actions including before/after states."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "target": step["target"],
+ "element_state": step["action_context"]["element_state"],
+ "viewport_state": step["action_context"]["viewport_state"]
+ }
+ for step in steps
+ ]
+ }
+
+ async def analyze_decision_trail(self) -> Dict[str, Any]:
+ """Analyze the decision making process and alternatives considered."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "confidence": step["action_context"]["element_state"].get("confidence", 1.0),
+ "alternatives": step["action_context"]["element_state"].get("alternatives", []),
+ "reasoning": step["action_context"]["element_state"].get("reasoning", [])
+ }
+ for step in steps
+ ]
+ }
+
+ async def analyze_element_identification(self) -> Dict[str, Any]:
+ """Analyze methods used to identify elements."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "target": step["target"],
+ "selector": step["action_context"]["element_state"].get("selector", ""),
+ "position": step["action_context"]["element_state"].get("position", {}),
+ "relationships": step["action_context"]["element_state"].get("relationships", {})
+ }
+ for step in steps
+ ]
+ }
+
+ async def analyze_failures(self) -> Dict[str, Any]:
+ """Analyze failure scenarios and recovery attempts."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+ failed_steps = [step for step in steps if step["status"] == "error"]
+
+ return {
+ "failed_steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "error": step["error_context"],
+ "recovery_attempts": step["action_context"]["element_state"].get("recovery_attempts", [])
+ }
+ for step in failed_steps
+ ],
+ "total_steps": len(steps),
+ "failed_steps_count": len(failed_steps)
+ }
+
+ async def analyze_session_context(self) -> Dict[str, Any]:
+ """Analyze session-wide context including navigation and network activity."""
+ trace_data = await self._load_trace_data()
+
+ return {
+ "metadata": trace_data["metadata"],
+ "network": trace_data["network"],
+ "performance": trace_data["performance"]
+ }
+
+ async def analyze_recovery_info(self) -> Dict[str, Any]:
+ """Analyze recovery information and checkpoints."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+ recovery_steps = [
+ step for step in steps
+ if step["status"] == "error" and step["action_context"]["element_state"].get("recovery_attempts")
+ ]
+
+ return {
+ "recovery_steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "recovery_attempts": step["action_context"]["element_state"]["recovery_attempts"],
+ "final_status": "recovered" if any(
+ attempt.get("success")
+ for attempt in step["action_context"]["element_state"].get("recovery_attempts", [])
+ ) else "failed"
+ }
+ for step in recovery_steps
+ ]
+ }
+
+ async def analyze_model_data(self) -> Dict[str, Any]:
+ """Analyze model-specific data including token usage and vision analysis."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "model_info": step["action_context"]["element_state"].get("model_info", {}),
+ "vision_analysis": step["action_context"]["element_state"].get("vision_analysis", {})
+ }
+ for step in steps
+ ]
+ }
+
+ async def analyze_temporal_context(self) -> Dict[str, Any]:
+ """Analyze temporal information including timing and wait conditions."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "timing": step["timing"],
+ "wait_conditions": step["action_context"]["element_state"].get("wait_conditions", [])
+ }
+ for step in steps
+ ],
+ "total_duration": sum(
+ step["timing"]["duration"] for step in steps
+ if step["timing"]["duration"] is not None
+ )
+ }
+
+ async def analyze_element_reporting(self) -> Dict[str, Any]:
+ """Analyze enhanced element reporting with detailed selection context."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "target": step["target"],
+ "element_state": step["action_context"]["element_state"],
+ "status": step["status"]
+ }
+ for step in steps
+ ]
+ }
+
+ async def analyze_error_context(self) -> Dict[str, Any]:
+ """Analyze error context and session state information."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+ error_steps = [step for step in steps if step["status"] == "error"]
+
+ return {
+ "error_steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "error_context": step["error_context"],
+ "session_state": {
+ "url": trace_data["metadata"]["browser_info"].get("url"),
+ "viewport": trace_data["metadata"]["browser_info"]["viewport"],
+ "network_status": any(
+                            (req["status"] or 0) >= 400
+ for req in trace_data["network"]["requests"]
+ )
+ }
+ }
+ for step in error_steps
+ ]
+ }
+
+ async def analyze_timing(self) -> Dict[str, Any]:
+ """Analyze detailed interaction timing information."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data["steps"]
+
+ return {
+ "steps": [
+ {
+ "step_id": step["step_id"],
+ "action": step["action"],
+ "timing": {
+ "start": step["timing"]["start"],
+ "end": step["timing"]["end"],
+ "duration": step["timing"]["duration"]
+ }
+ }
+ for step in steps
+ if step["timing"]["duration"] is not None
+ ],
+ "performance": trace_data["performance"],
+ "summary": {
+ "total_duration": sum(
+ step["timing"]["duration"] for step in steps
+ if step["timing"]["duration"] is not None
+ ),
+                # Guard against division by zero when no step has timing data
+                "average_step_duration": sum(
+                    step["timing"]["duration"] for step in steps
+                    if step["timing"]["duration"] is not None
+                ) / max(len([s for s in steps if s["timing"]["duration"] is not None]), 1)
+ }
+ }
+
+ async def analyze_visual_state(self) -> Dict[str, Any]:
+ """Analyze visual state changes with enhanced tracking."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data.get("steps", [])
+
+ visual_analysis = []
+ for step in steps:
+ visual_state = step.get("visual_state", {})
+ visual_analysis.append({
+ "step_id": step["step_id"],
+ "before_action": {
+ "screenshot": visual_state.get("screenshot_diffs", {}).get("before"),
+ "visible_elements": visual_state.get("element_visibility", {}).get("before", [])
+ },
+ "after_action": {
+ "screenshot": visual_state.get("screenshot_diffs", {}).get("after"),
+ "visible_elements": visual_state.get("element_visibility", {}).get("after", []),
+ "added_elements": visual_state.get("element_visibility", {}).get("added", []),
+ "removed_elements": visual_state.get("element_visibility", {}).get("removed", [])
+ },
+ "layout_shifts": visual_state.get("layout_shifts", [])
+ })
+
+ return {
+ "visual_changes": visual_analysis,
+ "cumulative_layout_shift": sum(
+ shift.get("cumulative_layout_shift", 0)
+ for step in visual_analysis
+ for shift in step.get("layout_shifts", [])
+ )
+ }
+
+ async def analyze_error_recovery(self) -> Dict[str, Any]:
+ """Analyze enhanced error recovery capabilities with improved context."""
+ trace_data = await self._load_trace_data()
+ steps = trace_data.get("steps", [])
+ error_steps = [step for step in steps if step.get("status") == "error"]
+
+ recovery_analysis = []
+ for step in error_steps:
+ error_ctx = step.get("error_context", {})
+ recovery_analysis.append({
+ "step_id": step["step_id"],
+ "error_type": error_ctx.get("error_type", "unknown"),
+ "target_element": {
+ "selector": error_ctx.get("target_element", {}).get("selector"),
+ "visible_similar_elements": error_ctx.get("target_element", {}).get("visible_similar_elements", [])
+ },
+ "recovery_attempts": error_ctx.get("recovery_attempts", []),
+ "environment_factors": error_ctx.get("environment_factors", {})
+ })
+
+ return {
+ "error_steps": recovery_analysis,
+ "recovery_success_rate": len([r for r in recovery_analysis if any(
+                attempt.get("outcome") == "success" for attempt in r["recovery_attempts"]
+ )]) / len(recovery_analysis) if recovery_analysis else 1.0
+ }
+
+ async def analyze_performance(self) -> Dict[str, Any]:
+ """Analyze performance metrics including navigation and interaction timing."""
+ trace_data = await self._load_trace_data()
+ performance = trace_data.get("performance", {})
+
+ return {
+ "navigation_timing": performance.get("navigation_timing", {}),
+ "interaction_timing": performance.get("interaction_timing", {}),
+ "metrics_summary": {
+ "avg_action_latency": performance.get("interaction_timing", {}).get("action_latency", 0),
+ "total_interaction_time": sum(
+ step.get("timing", {}).get("duration", 0)
+ for step in trace_data.get("steps", [])
+ )
+ }
+ }
+
+ async def analyze_all(self) -> Dict[str, Any]:
+ """Perform comprehensive analysis of all trace components.
+
+ Returns:
+ Dict containing analysis results from all components:
+ - action_context: Action and element state analysis
+ - decision_trail: Decision-making process analysis
+ - element_identification: Element location and relationships
+ - failure_analysis: Failure scenarios and recovery attempts
+ - session_context: Session-wide context and navigation
+ - recovery_info: Recovery strategies and checkpoints
+ - model_data: Model-specific data and vision analysis
+ - temporal_context: Timing and sequence information
+ - element_reporting: Enhanced element selection reporting
+ - error_context: Error handling and recovery context
+ - timing_analysis: Detailed timing breakdown
+ - visual_state: Visual changes and layout analysis
+ - error_recovery: Enhanced error recovery capabilities
+ - performance: Performance metrics and timing analysis
+ """
+ trace_data = await self._load_trace_data()
+
+ return {
+ "action_context": await self.analyze_action_context(),
+ "decision_trail": await self.analyze_decision_trail(),
+ "element_identification": await self.analyze_element_identification(),
+ "failure_analysis": await self.analyze_failures(),
+ "session_context": await self.analyze_session_context(),
+ "recovery_info": await self.analyze_recovery_info(),
+ "model_data": await self.analyze_model_data(),
+ "temporal_context": await self.analyze_temporal_context(),
+ "element_reporting": await self.analyze_element_reporting(),
+ "error_context": await self.analyze_error_context(),
+ "timing_analysis": await self.analyze_timing(),
+ "visual_state": await self.analyze_visual_state(),
+ "error_recovery": await self.analyze_error_recovery(),
+ "performance": await self.analyze_performance()
+ }
\ No newline at end of file
diff --git a/src/utils/browser_controller.py b/src/utils/browser_controller.py
new file mode 100644
index 00000000..2171574b
--- /dev/null
+++ b/src/utils/browser_controller.py
@@ -0,0 +1,141 @@
+from typing import Optional, Any
+import asyncio
+from playwright.async_api import async_playwright, Browser, Playwright
+from .structured_logging import StructuredLogger, setup_structured_logging
+
+class BrowserController:
+ def __init__(self):
+ self.browser: Optional[Browser] = None
+ self.init_promise: Optional[asyncio.Task] = None
+ self.init_count: int = 0
+ self._playwright: Optional[Playwright] = None
+ self.logger = StructuredLogger("browser_controller")
+ setup_structured_logging()
+
+ async def initialize(self) -> None:
+ """Initialize the browser if not already initialized."""
+ if self.init_promise is not None:
+ try:
+ await self.init_promise
+ except Exception as e:
+ # If the current initialization fails, reset state to allow retry
+ self.init_promise = None
+ self.browser = None
+ self.logger.log_browser_event("initialization_failed", {
+ "error": str(e),
+ "attempt": self.init_count + 1
+ })
+ raise
+
+ if self.browser is not None:
+ return
+
+ # Create new initialization task
+ self.logger.log_progress(
+ step="browser_init",
+ status="starting",
+ progress=0.0,
+ message="Starting browser initialization"
+ )
+ self.init_promise = asyncio.create_task(self._do_browser_init())
+ try:
+ await self.init_promise
+ self.logger.log_progress(
+ step="browser_init",
+ status="completed",
+ progress=1.0,
+ message="Browser initialization completed"
+ )
+ except Exception as e:
+ # Reset state on failure
+ self.init_promise = None
+ self.browser = None
+ self.logger.log_progress(
+ step="browser_init",
+ status="failed",
+ progress=0.0,
+ message=f"Browser initialization failed: {str(e)}"
+ )
+ raise
+
+ async def _do_browser_init(self) -> None:
+ """Internal method to handle browser initialization."""
+ if self.browser is not None:
+ return
+
+ self.logger.log_progress(
+ step="browser_init",
+ status="launching",
+ progress=0.3,
+ message="Launching Playwright"
+ )
+ playwright = await async_playwright().start()
+ self._playwright = playwright
+
+ try:
+ self.logger.log_progress(
+ step="browser_init",
+ status="configuring",
+ progress=0.6,
+ message="Configuring browser"
+ )
+ self.browser = await playwright.chromium.launch(
+ headless=True,
+ args=['--no-sandbox']
+ )
+ self.init_count += 1
+
+ self.logger.log_browser_event("browser_launched", {
+ "initialization_count": self.init_count,
+ "headless": True
+ })
+
+ except Exception as e:
+ await self._cleanup_playwright()
+ self.logger.log_browser_event("launch_failed", {
+ "error": str(e),
+ "initialization_count": self.init_count
+ })
+ raise
+
+ async def _cleanup_playwright(self) -> None:
+ """Clean up the playwright context."""
+ if self._playwright:
+ self.logger.log_browser_event("cleanup_playwright", {
+ "status": "starting"
+ })
+ await self._playwright.stop()
+ self._playwright = None
+ self.logger.log_browser_event("cleanup_playwright", {
+ "status": "completed"
+ })
+
+ async def cleanup(self) -> None:
+ """Clean up browser resources."""
+ self.logger.log_progress(
+ step="cleanup",
+ status="starting",
+ progress=0.0,
+ message="Starting browser cleanup"
+ )
+
+ if self.browser:
+ self.logger.log_progress(
+ step="cleanup",
+ status="closing_browser",
+ progress=0.5,
+ message="Closing browser"
+ )
+ await self.browser.close()
+ self.browser = None
+
+ await self._cleanup_playwright()
+ self.init_promise = None
+ self.init_count = 0
+
+ self.logger.log_progress(
+ step="cleanup",
+ status="completed",
+ progress=1.0,
+ message="Browser cleanup completed"
+ )
\ No newline at end of file
diff --git a/src/utils/error_handling.py b/src/utils/error_handling.py
new file mode 100644
index 00000000..2a4f744c
--- /dev/null
+++ b/src/utils/error_handling.py
@@ -0,0 +1,53 @@
+import asyncio
+from datetime import datetime
+from typing import Dict, Any, Optional
+import re
+
+class MaxRetriesExceededError(Exception):
+ def __init__(self, operation: str, original_error: Exception):
+ self.operation = operation
+ self.original_error = original_error
+ super().__init__(f"Max retries exceeded for operation '{operation}': {str(original_error)}")
+
+class ErrorHandler:
+ MAX_RETRIES = 3
+
+ def __init__(self):
+ self._retry_counts: Dict[str, int] = {}
+ self._last_error: Optional[Dict[str, Any]] = None
+
+ async def handle_error(self, error: Exception, operation: str) -> None:
+ retry_count = self._retry_counts.get(operation, 0)
+
+ if retry_count >= self.MAX_RETRIES:
+ raise MaxRetriesExceededError(operation, error)
+
+ self._retry_counts[operation] = retry_count + 1
+ await self._log_error(error, operation, retry_count)
+
+ # Exponential backoff: 2^retry_count seconds
+ await asyncio.sleep(2 ** retry_count)
+
+ async def _log_error(self, error: Exception, operation: str, retry_count: int) -> None:
+ error_context = {
+ "operation": operation,
+ "attempt": retry_count + 1,
+ "timestamp": datetime.now().isoformat(),
+ "error": {
+ "name": error.__class__.__name__,
+ "message": str(error),
+ "code": self.extract_error_code(error)
+ }
+ }
+
+ self._last_error = error_context
+ # In a real implementation, we would log to a file or logging service
+ print(f"Error: {error_context}")
+
+ def extract_error_code(self, error: Exception) -> str:
+ error_message = str(error)
+ match = re.search(r'ERR_[A-Z_]+', error_message)
+ return match.group(0) if match else "UNKNOWN_ERROR"
+
+ def get_last_error(self) -> Optional[Dict[str, Any]]:
+ return self._last_error
\ No newline at end of file
diff --git a/src/utils/logging.py b/src/utils/logging.py
new file mode 100644
index 00000000..982ffdc2
--- /dev/null
+++ b/src/utils/logging.py
@@ -0,0 +1,158 @@
+import json
+import logging
+import datetime
+from typing import Any, Dict, List, Optional
+from enum import Enum
+import traceback
+import types
+
+class LogLevel(str, Enum):
+ CRITICAL = "CRITICAL"
+ ERROR = "ERROR"
+ WARNING = "WARNING"
+ INFO = "INFO"
+ DEBUG = "DEBUG"
+ TRACE = "TRACE"
+
+class LogJSONEncoder(json.JSONEncoder):
+ def default(self, obj):
+ if isinstance(obj, Exception):
+ return {
+ 'type': obj.__class__.__name__,
+ 'message': str(obj),
+ 'traceback': traceback.format_exception(type(obj), obj, obj.__traceback__)
+ }
+ if isinstance(obj, type):
+ return obj.__name__
+ if isinstance(obj, types.TracebackType):
+ return traceback.format_tb(obj)
+ return super().default(obj)
+
+class LogFormatter(logging.Formatter):
+ def __init__(self, use_json: bool = True):
+ super().__init__()
+ self.use_json = use_json
+ self._event_counter: Dict[str, int] = {}
+
+ def _serialize_error(self, exc_info) -> Dict[str, str]:
+ """Serialize error information into a dictionary."""
+ exc_type, exc_value, exc_tb = exc_info
+ return {
+ "type": exc_type.__name__ if exc_type else "Unknown",
+ "message": str(exc_value) if exc_value else "",
+ "stack_trace": self.formatException(exc_info) if exc_tb else ""
+ }
+
+ def format(self, record: logging.LogRecord) -> str:
+ timestamp = datetime.datetime.fromtimestamp(record.created).strftime("%Y-%m-%dT%H:%M:%S")
+
+ # Extract additional fields if they exist
+ extra_fields = {}
+ for key, value in vars(record).items():
+ if key not in logging.LogRecord.__dict__ and not key.startswith('_'):
+ extra_fields[key] = value
+
+ if self.use_json:
+ log_entry = {
+ "timestamp": timestamp,
+ "level": record.levelname,
+ "logger": record.name or "root",
+ "message": record.getMessage(),
+ **extra_fields
+ }
+
+ if hasattr(record, 'event_type'):
+ log_entry["event_type"] = getattr(record, 'event_type')
+
+ if hasattr(record, 'event_data'):
+ log_entry["data"] = getattr(record, 'event_data')
+
+ if record.exc_info and record.levelno >= logging.ERROR:
+ log_entry["error"] = self._serialize_error(record.exc_info)
+
+ return json.dumps(log_entry, cls=LogJSONEncoder)
+ else:
+ # Compact format for non-JSON logs
+ basic_msg = f"[{timestamp}] {record.levelname[0]}: {record.getMessage()}"
+
+ if record.exc_info and record.levelno >= logging.ERROR:
+ return f"{basic_msg}\n{self.formatException(record.exc_info)}"
+
+ return basic_msg
+
+class BatchedEventLogger:
+ def __init__(self, logger: logging.Logger):
+ self._logger = logger
+ self._batched_events: Dict[str, List[Dict[str, Any]]] = {}
+
+ def add_event(self, event_type: str, event_data: Dict[str, Any]) -> None:
+ if event_type not in self._batched_events:
+ self._batched_events[event_type] = []
+ self._batched_events[event_type].append(event_data)
+
+ def flush(self) -> None:
+ for event_type, events in self._batched_events.items():
+ if events:
+ self._logger.info(
+ f"Batch: {len(events)} {event_type} events",
+ extra={
+ "event_type": f"batched_{event_type}",
+ "event_data": {
+ "count": len(events),
+ "events": events
+ }
+ }
+ )
+ self._batched_events.clear()
+
+def setup_logging(
+ level: str = "INFO",
+ use_json: bool = True,
+ log_file: Optional[str] = None,
+ exclude_patterns: Optional[List[str]] = None
+) -> None:
+ """
+ Setup logging configuration with the improved formatter
+
+ Args:
+ level: The logging level to use
+ use_json: Whether to use JSON formatting
+ log_file: Optional file to write logs to
+ exclude_patterns: Optional list of patterns to exclude from logging
+ """
+ root_logger = logging.getLogger()
+ root_logger.setLevel(level)
+
+ # Clear any existing handlers
+ root_logger.handlers.clear()
+
+ # Create console handler
+ console_handler = logging.StreamHandler()
+ console_handler.setFormatter(LogFormatter(use_json=use_json))
+
+ if exclude_patterns:
+ class ExcludeFilter(logging.Filter):
+ def filter(self, record: logging.LogRecord) -> bool:
+ return not any(pattern in record.getMessage() for pattern in exclude_patterns)
+
+ console_handler.addFilter(ExcludeFilter())
+
+ root_logger.addHandler(console_handler)
+
+ # Add file handler if specified
+ if log_file:
+ file_handler = logging.FileHandler(log_file)
+ file_handler.setFormatter(LogFormatter(use_json=True)) # Always use JSON for file logging
+ if exclude_patterns:
+ file_handler.addFilter(ExcludeFilter())
+ root_logger.addHandler(file_handler)
+
+# Production filter patterns
+PRODUCTION_EXCLUDE_PATTERNS = [
+ "deprecated",
+ "virtual environment",
+ "Activating virtual environment",
+ "✅ Eval: Success",
+ "🤔 Thought:",
+ "VIRTUAL_ENV:"
+]
\ No newline at end of file
diff --git a/src/utils/structured_logging.py b/src/utils/structured_logging.py
new file mode 100644
index 00000000..8568de23
--- /dev/null
+++ b/src/utils/structured_logging.py
@@ -0,0 +1,223 @@
+from typing import Optional, Dict, Any, List
+import logging
+import json
+from dataclasses import dataclass, asdict
+from datetime import datetime
+from colorama import init, Fore, Style
+import os
+
+# Initialize colorama
+init()
+
+@dataclass
+class ColorScheme:
+ """Color scheme for different log elements."""
+ ERROR: str = Fore.RED
+ WARNING: str = Fore.YELLOW
+ INFO: str = Fore.CYAN
+ DEBUG: str = Style.DIM
+ TIMESTAMP: str = Fore.WHITE
+ SUCCESS: str = Fore.GREEN
+ STEP: str = Fore.BLUE
+ RESET: str = Style.RESET_ALL
+
+class ColorizedFormatter(logging.Formatter):
+ """Formatter that adds colors to log output."""
+
+ def __init__(self, use_colors: bool = True):
+ super().__init__()
+ self.use_colors = use_colors and not os.getenv('NO_COLOR')
+ self.colors = ColorScheme()
+
+ def colorize(self, text: str, color: str) -> str:
+ """Add color to text if colors are enabled."""
+ if self.use_colors:
+ return f"{color}{text}{self.colors.RESET}"
+ return text
+
+ def format(self, record: logging.LogRecord) -> str:
+ """Format the log record with colors."""
+ # Get the appropriate color for the log level
+ level_color = getattr(self.colors, record.levelname, self.colors.INFO)
+
+ # Format timestamp
+ timestamp = self.colorize(
+ datetime.utcnow().strftime("%H:%M:%S"),
+ self.colors.TIMESTAMP
+ )
+
+ # Format level
+ level = self.colorize(record.levelname, level_color)
+
+ # Format message and handle special keywords
+ msg = record.getMessage()
+ if "✓" in msg:
+ msg = msg.replace("✓", self.colorize("✓", self.colors.SUCCESS))
+ if "×" in msg:
+ msg = msg.replace("×", self.colorize("×", self.colors.ERROR))
+ if "STEP" in msg:
+ msg = msg.replace("STEP", self.colorize("STEP", self.colors.STEP))
+
+ # Build the basic log message
+ log_message = f"[{timestamp}] {level} {msg}"
+
+ # Add structured data if available
+ if hasattr(record, 'event_type'):
+ event_type = self.colorize(record.event_type, self.colors.INFO)
+ if hasattr(record, 'data'):
+ # Format the data as JSON but don't colorize it
+ data_str = json.dumps(record.data, indent=2)
+ log_message = f"{log_message} | {event_type} | {data_str}"
+
+ return log_message
+
+class JSONFormatter(logging.Formatter):
+ """Custom JSON formatter for structured logs."""
+
+ def format(self, record: logging.LogRecord) -> str:
+ """Format the log record as a JSON string."""
+ output = {
+ "timestamp": datetime.utcnow().isoformat(),
+ "level": record.levelname,
+ "message": record.getMessage(),
+ "logger": record.name
+ }
+
+ # Add extra fields from record.__dict__ to handle custom attributes
+ if hasattr(record, '__dict__'):
+ for key, value in record.__dict__.items():
+ if key not in output and key not in ('args', 'exc_info', 'exc_text', 'msg'):
+ output[key] = value
+
+ return json.dumps(output)
+
+def setup_structured_logging(level: int = logging.INFO, use_colors: bool = True, json_output: bool = False) -> None:
+ """Set up structured logging with optional colorized output."""
+ root_logger = logging.getLogger()
+ root_logger.setLevel(level)
+
+ # Remove existing handlers
+ for handler in root_logger.handlers[:]:
+ root_logger.removeHandler(handler)
+
+ # Create console handler with appropriate formatter
+ handler = logging.StreamHandler()
+ if json_output:
+ handler.setFormatter(JSONFormatter())
+ else:
+ handler.setFormatter(ColorizedFormatter(use_colors=use_colors))
+
+ root_logger.addHandler(handler)
+
+@dataclass
+class ProgressEvent:
+ """Represents a progress update in the browser automation process."""
+ step: str
+ status: str
+ progress: float # 0.0 to 1.0
+ message: str
+ timestamp: Optional[str] = None
+
+ def __post_init__(self):
+ if self.timestamp is None:
+ self.timestamp = datetime.utcnow().isoformat()
+
+@dataclass
+class BrowserEvent:
+ """Represents a browser-related event."""
+ event_type: str
+ details: Dict[str, Any]
+ timestamp: Optional[str] = None
+
+ def __post_init__(self):
+ if self.timestamp is None:
+ self.timestamp = datetime.utcnow().isoformat()
+
+class StructuredLogger:
+ """Handles structured logging with progress reporting and feedback."""
+
+ def __init__(self, logger_name: str = "browser_automation"):
+ self.logger = logging.getLogger(logger_name)
+ self.progress_events: List[ProgressEvent] = []
+ self.browser_events: List[BrowserEvent] = []
+ self._current_progress: float = 0.0
+
+ def log_progress(self, step: str, status: str, progress: float, message: str) -> None:
+ """Log a progress update."""
+ event = ProgressEvent(step=step, status=status, progress=progress, message=message)
+ self.progress_events.append(event)
+ self._current_progress = progress
+
+ self.logger.info("Progress Update", extra={
+ "event_type": "progress",
+ "data": asdict(event)
+ })
+
+ def log_browser_event(self, event_type: str, details: Dict[str, Any]) -> None:
+ """Log a browser-related event."""
+ event = BrowserEvent(event_type=event_type, details=details)
+ self.browser_events.append(event)
+
+ self.logger.info(f"Browser Event: {event_type}", extra={
+ "event_type": "browser",
+ "data": asdict(event)
+ })
+
+ def get_current_progress(self) -> float:
+ """Get the current progress as a float between 0 and 1."""
+ return self._current_progress
+
+ def get_progress_history(self) -> List[Dict[str, Any]]:
+ """Get the history of progress events."""
+ return [asdict(event) for event in self.progress_events]
+
+ def get_browser_events(self) -> List[Dict[str, Any]]:
+ """Get all browser events."""
+ return [asdict(event) for event in self.browser_events]
+
+ def clear_history(self) -> None:
+ """Clear all stored events."""
+ self.progress_events.clear()
+ self.browser_events.clear()
+ self._current_progress = 0.0
+
+class EventBatcher:
+ def __init__(self, batch_size: int = 5):
+ self.events: List[BrowserEvent] = []
+ self.batch_size = max(1, batch_size) # Ensure minimum batch size of 1
+
+ def add_event(self, event: BrowserEvent) -> Optional[Dict[str, Any]]:
+ self.events.append(event)
+ if len(self.events) >= self.batch_size:
+ return self.flush_events()
+ return None
+
+ def flush_events(self) -> Dict[str, Any]:
+ if not self.events:
+ return {
+ "timestamp": datetime.now().isoformat(),
+ "total_events": 0,
+ "success_count": 0,
+ "error_count": 0,
+ "duration_ms": 0
+ }
+
+        summary = {
+            "timestamp": datetime.now().isoformat(),
+            "total_events": len(self.events),
+            # BrowserEvent stores its payload in `details`; status and metrics are read from there
+            "success_count": sum(1 for e in self.events if e.details.get("status") == "success"),
+            "error_count": sum(1 for e in self.events if e.details.get("status") == "failed"),
+            "duration_ms": self._calculate_total_duration()
+ }
+ self.events = []
+ return summary
+
+ def get_event_count(self) -> int:
+ return len(self.events)
+
+ def _calculate_total_duration(self) -> int:
+ total_duration = 0
+        for event in self.events:
+            metrics = event.details.get("metrics") or {}
+            if "duration_ms" in metrics:
+                total_duration += metrics["duration_ms"]
+ return total_duration
\ No newline at end of file
diff --git a/src/utils/task_logging.py b/src/utils/task_logging.py
new file mode 100644
index 00000000..908774a2
--- /dev/null
+++ b/src/utils/task_logging.py
@@ -0,0 +1,562 @@
+from typing import Dict, Any, List, Literal, Optional, Union, Callable, TypeVar, Awaitable
+from dataclasses import dataclass, asdict, field
+from datetime import datetime
+import json
+from enum import Enum
+import traceback
+import asyncio
+import random
+import os
+from colorama import init, Fore, Style
+
+# Initialize colorama for cross-platform color support
+init()
+
+# Define generic type parameter at module level
+T = TypeVar('T')
+
+class TaskStatus(str, Enum):
+ PENDING = "pending"
+ RUNNING = "running"
+ COMPLETE = "complete"
+ FAILED = "failed"
+
+class ActionType(str, Enum):
+ NAVIGATION = "navigation"
+ INTERACTION = "interaction"
+ EXTRACTION = "extraction"
+ VALIDATION = "validation"
+ RECOVERY = "recovery"
+
+ @property
+ def emoji(self) -> str:
+ """Get the emoji representation of the action type."""
+ return {
+ ActionType.NAVIGATION: "🌐",
+ ActionType.INTERACTION: "🖱️",
+ ActionType.EXTRACTION: "📑",
+ ActionType.VALIDATION: "✅",
+ ActionType.RECOVERY: "🔄"
+ }[self]
+
+@dataclass
+class PerformanceMetrics:
+ """Performance metrics for task execution."""
+ total_duration: float = 0.0
+ step_breakdown: Dict[str, float] = field(default_factory=dict)
+
+ def add_step_duration(self, step_type: str, duration: float) -> None:
+ """Add duration for a step type."""
+ if step_type not in self.step_breakdown:
+ self.step_breakdown[step_type] = 0
+ self.step_breakdown[step_type] += duration
+ self.total_duration += duration
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert metrics to a dictionary."""
+ return {
+ "total_duration": self.total_duration,
+ "step_breakdown": self.step_breakdown
+ }
+
+@dataclass
+class ErrorInfo:
+ """Information about an error that occurred."""
+ type: str
+ message: str
+ step: int
+ action: str
+ traceback: Optional[str] = None
+
+@dataclass
+class StepInfo:
+ """Information about the current step in a task."""
+ number: int
+ description: str
+ started_at: str
+ status: Union[TaskStatus, str]
+ duration: Optional[float] = None
+ progress: Optional[float] = None
+ action_type: Optional[ActionType] = None
+ context: Optional[Dict[str, Any]] = None
+ results: Optional[Dict[str, Any]] = None
+ suppress_similar: bool = False
+
+ def __post_init__(self):
+ if isinstance(self.status, str):
+ self.status = TaskStatus(self.status)
+ if isinstance(self.action_type, str):
+ self.action_type = ActionType(self.action_type)
+
+ @property
+ def status_value(self) -> str:
+ """Get the string value of the status."""
+ return self.status.value if isinstance(self.status, TaskStatus) else str(self.status)
+
+@dataclass
+class BrowserState:
+ """Current state of the browser."""
+ url: str
+ page_ready: bool
+ dynamic_content_loaded: bool
+ visible_elements: int
+ current_frame: Optional[str] = None
+ active_element: Optional[str] = None
+ page_title: Optional[str] = None
+
+@dataclass
+class RetryConfig:
+ """Configuration for retry behavior."""
+ max_retries: int = 3
+ base_delay: float = 1.0
+ max_delay: float = 10.0
+ jitter: float = 0.1
+
+ def get_delay(self, attempt: int) -> float:
+ """Calculate delay for a given attempt using exponential backoff."""
+ if attempt == 0:
+ return 0
+ if attempt > self.max_retries:
+ return -1
+
+ # Calculate exponential delay
+ delay = self.base_delay * (2 ** (attempt - 1))
+ delay = min(delay, self.max_delay)
+
+ # Add jitter if configured
+ if self.jitter > 0:
+ jitter_range = delay * self.jitter
+ delay += random.uniform(-jitter_range/2, jitter_range/2)
+
+ return max(0, delay)
+
+@dataclass
+class RetryInfo:
+ """Information about retry attempts."""
+ attempts: int = 0
+ success: bool = False
+ history: List[Dict[str, Any]] = field(default_factory=list)
+
+@dataclass
+class TaskContext:
+ """Context information for a task."""
+ id: str
+ goal: str
+ current_step: StepInfo
+ browser_state: BrowserState
+ started_at: Optional[str] = None
+ error: Optional[ErrorInfo] = None
+ performance: Optional[PerformanceMetrics] = None
+ log_history: List[StepInfo] = field(default_factory=list)
+ retries: Optional[RetryInfo] = None
+
+ def __post_init__(self):
+ if self.started_at is None:
+ self.started_at = datetime.utcnow().isoformat()
+ if self.performance is None:
+ self.performance = PerformanceMetrics()
+ if self.retries is None:
+ self.retries = RetryInfo()
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert the context to a dictionary for logging."""
+ result = {
+ "timestamp": datetime.utcnow().isoformat(),
+ "task": {
+ "id": self.id,
+ "goal": self.goal,
+ "progress": self._format_progress(),
+ "elapsed_time": self._calculate_elapsed_time(),
+ "status": self.current_step.status_value
+ }
+ }
+
+ # Add retry information if available
+ if self.retries and self.retries.attempts > 0:
+ result["task"]["retries"] = {
+ "attempts": self.retries.attempts,
+ "success": self.retries.success,
+ "history": self.retries.history
+ }
+
+ # Add current action information
+ if self.current_step.action_type:
+ result["task"]["current_action"] = self.current_step.action_type.value
+ if self.current_step.context:
+ result["task"]["action_context"] = self.current_step.context
+ if self.current_step.results:
+ result["task"]["action_results"] = self.current_step.results
+
+ # Add browser state
+ result["browser"] = {
+ "url": self.browser_state.url,
+ "state": "ready" if self.browser_state.page_ready else "loading",
+ "visible_elements": self.browser_state.visible_elements,
+ "dynamic_content": "loaded" if self.browser_state.dynamic_content_loaded else "loading"
+ }
+
+ if self.browser_state.current_frame:
+ result["browser"]["current_frame"] = self.browser_state.current_frame
+ if self.browser_state.active_element:
+ result["browser"]["active_element"] = self.browser_state.active_element
+ if self.browser_state.page_title:
+ result["browser"]["page_title"] = self.browser_state.page_title
+
+ if self.error:
+ result["error"] = {
+ "type": self.error.type,
+ "message": self.error.message,
+ "step": self.error.step,
+ "action": self.error.action
+ }
+ if self.error.traceback:
+ result["error"]["traceback"] = self.error.traceback
+
+ if self.performance and self.performance.step_breakdown:
+ result["performance"] = self.performance.to_dict()
+
+ return result
+
+ def _format_progress(self) -> str:
+ """Format the progress information."""
+ if self.current_step.progress is not None:
+ return f"{int(self.current_step.progress * 100)}%"
+ return f"{self.current_step.number}/unknown steps"
+
+ def _calculate_elapsed_time(self) -> str:
+ """Calculate the elapsed time since task start."""
+ if self.started_at is None:
+ return "0.0s"
+ start = datetime.fromisoformat(self.started_at)
+ elapsed = datetime.utcnow() - start
+ return f"{elapsed.total_seconds():.1f}s"
+
+@dataclass
+class ColorScheme:
+ """Color scheme for log messages."""
+ error: str = Fore.RED
+ warning: str = Fore.YELLOW
+ info: str = Fore.CYAN
+ success: str = Fore.GREEN
+ reset: str = Style.RESET_ALL
+
+ @property
+ def enabled(self) -> bool:
+ """Check if colors should be enabled."""
+ return not bool(os.getenv("NO_COLOR"))
+
+ def apply(self, text: str, color: str) -> str:
+ """Apply color to text if colors are enabled."""
+ if not self.enabled:
+ return text
+ return f"{color}{text}{self.reset}"
+
+class LogFormatter:
+ """Formatter for log messages with color support."""
+
+ def __init__(self, color_scheme: Optional[ColorScheme] = None):
+ self.colors = color_scheme or ColorScheme()
+
+ def format(self, record: Any) -> str:
+ """Format a log record with appropriate colors."""
+ level_colors = {
+ "ERROR": self.colors.error,
+ "WARNING": self.colors.warning,
+ "INFO": self.colors.info
+ }
+
+ # Format timestamp
+ timestamp = datetime.fromtimestamp(record.created).strftime("%Y-%m-%d %H:%M:%S")
+
+ # Color the level name
+ level_color = level_colors.get(record.levelname, self.colors.info)
+ colored_level = self.colors.apply(record.levelname, level_color)
+
+        # Use getMessage() so %-style arguments are interpolated into the message
+        return f"[{timestamp}] {colored_level}: {record.getMessage()}"
+
+@dataclass
+class SeparatorStyle:
+ """Style configuration for visual separators."""
+ task: str = "=" * 50 # Task separator (longer)
+ phase: str = "-" * 30 # Phase separator (medium)
+ error: str = "*" * 40 # Error separator (distinct)
+
+class TaskLogger:
+ """Advanced logger for task context and state tracking."""
+
+ def __init__(
+ self,
+ task_id: str,
+ goal: str,
+ color_scheme: Optional[ColorScheme] = None,
+ separator_style: Optional[SeparatorStyle] = None,
+ use_separators: bool = True
+ ):
+ self.context = TaskContext(
+ id=task_id,
+ goal=goal,
+ current_step=StepInfo(
+ number=1,
+ description="Task initialized",
+ started_at=datetime.utcnow().isoformat(),
+ status=TaskStatus.PENDING
+ ),
+ browser_state=BrowserState(
+ url="",
+ page_ready=False,
+ dynamic_content_loaded=False,
+ visible_elements=0
+ ),
+ retries=RetryInfo()
+ )
+ self._step_start_time: Optional[datetime] = None
+ self.colors = color_scheme or ColorScheme()
+ self.separators = separator_style or SeparatorStyle()
+ self.use_separators = use_separators
+
+ # Add initial task separator and goal
+ if self.use_separators:
+ self._add_separator("task")
+ self._add_log_entry(f"TASK GOAL: {goal}")
+
+ def start_phase(self, phase_name: str) -> None:
+ """Start a new phase in the task."""
+ if self.use_separators:
+ self._add_separator("phase")
+ self._add_log_entry(f"PHASE: {phase_name}")
+
+ def _add_separator(self, separator_type: Literal["task", "phase", "error"]) -> None:
+ """Add a visual separator to the log history."""
+ if not self.use_separators:
+ return
+
+ separator = getattr(self.separators, separator_type)
+ colored_separator = self.colors.apply(
+ separator,
+ self.colors.info if separator_type != "error" else self.colors.error
+ )
+ self._add_log_entry(colored_separator)
+
+ def _add_log_entry(self, entry: str) -> None:
+ """Add a raw log entry to the history."""
+ step = StepInfo(
+ number=self.context.current_step.number,
+ description=entry,
+ started_at=datetime.utcnow().isoformat(),
+ status=TaskStatus.RUNNING
+ )
+ self.context.log_history.append(step)
+
+ def update_step(self,
+ description: str,
+ status: TaskStatus,
+ progress: Optional[float] = None,
+ action_type: Optional[ActionType] = None,
+ context: Optional[Dict[str, Any]] = None,
+ results: Optional[Dict[str, Any]] = None,
+ suppress_similar: bool = False) -> None:
+ """Update the current step information."""
+ step_duration = None
+ if self._step_start_time:
+ step_duration = (datetime.utcnow() - self._step_start_time).total_seconds()
+
+ new_step = StepInfo(
+ number=self.context.current_step.number + 1,
+ description=description,
+ started_at=datetime.utcnow().isoformat(),
+ status=status,
+ duration=step_duration,
+ progress=progress,
+ action_type=action_type,
+ context=context,
+ results=results,
+ suppress_similar=suppress_similar
+ )
+
+ # Check if we should suppress this step
+ if not suppress_similar or not self._is_similar_to_previous(new_step):
+ self.context.log_history.append(new_step)
+ self.context.current_step = new_step
+ self._step_start_time = datetime.utcnow()
+ else:
+ # Update the previous step with new status/results
+ prev_step = self.context.log_history[-1]
+ prev_step.status = status
+ if results:
+ prev_step.results = results
+ # Update current step to reflect changes
+ self.context.current_step = prev_step
+
+    def _is_similar_to_previous(self, step: StepInfo) -> bool:
+        """Check if a step is similar to the previous one."""
+        if not self.context.log_history:
+            return False
+        prev_step = self.context.log_history[-1]
+        if not prev_step.description or not step.description:
+            return False
+        return (
+            prev_step.action_type == step.action_type and
+            prev_step.description.split()[0] == step.description.split()[0]  # Compare first word only
+        )
+
+ def get_log_history(self) -> List[str]:
+ """Get the formatted history of log entries."""
+ return [self._format_step(step) for step in self.context.log_history]
+
+ def _format_step(self, step: StepInfo) -> str:
+ """Format a step as a log entry with colors."""
+ timestamp = datetime.fromisoformat(step.started_at).strftime("%Y-%m-%d %H:%M:%S")
+ duration = f"({step.duration:.1f}s)" if step.duration is not None else ""
+
+ # Color-coded status symbols
+ if isinstance(step.status, TaskStatus):
+ status_symbol = {
+ TaskStatus.COMPLETE: self.colors.apply("✓", self.colors.success),
+ TaskStatus.FAILED: self.colors.apply("×", self.colors.error),
+ TaskStatus.RUNNING: self.colors.apply("→", self.colors.info),
+ TaskStatus.PENDING: self.colors.apply("→", self.colors.info)
+ }.get(step.status, self.colors.apply("→", self.colors.info))
+ else:
+ status_symbol = self.colors.apply("→", self.colors.info)
+
+ # Color-coded action emoji
+ action_emoji = step.action_type.emoji if step.action_type else ""
+ if action_emoji:
+ action_emoji = self.colors.apply(action_emoji, self.colors.info)
+
+ # Format step number with info color
+ step_number = self.colors.apply(f"STEP {step.number}/?", self.colors.info)
+
+ return f"[{timestamp}] {action_emoji} {step_number} {step.description} {status_symbol} {duration}"
+
+ def format_log_entry(self) -> str:
+ """Format the current state as a log entry."""
+ return self._format_step(self.context.current_step)
+
+ def update_browser_state(self,
+ url: Optional[str] = None,
+ page_ready: Optional[bool] = None,
+ dynamic_content_loaded: Optional[bool] = None,
+ visible_elements: Optional[int] = None,
+ current_frame: Optional[str] = None,
+ active_element: Optional[str] = None,
+ page_title: Optional[str] = None) -> None:
+ """Update the browser state information."""
+ if url is not None:
+ self.context.browser_state.url = url
+ if page_ready is not None:
+ self.context.browser_state.page_ready = page_ready
+ if dynamic_content_loaded is not None:
+ self.context.browser_state.dynamic_content_loaded = dynamic_content_loaded
+ if visible_elements is not None:
+ self.context.browser_state.visible_elements = visible_elements
+ if current_frame is not None:
+ self.context.browser_state.current_frame = current_frame
+ if active_element is not None:
+ self.context.browser_state.active_element = active_element
+ if page_title is not None:
+ self.context.browser_state.page_title = page_title
+
+ def log_error(self, error: Exception, step_number: int, action: str) -> None:
+ """Log an error with context."""
+ if self.use_separators:
+ self._add_separator("error")
+
+ self.context.error = ErrorInfo(
+ type=error.__class__.__name__,
+ message=str(error),
+ step=step_number,
+ action=action,
+ traceback=traceback.format_exc()
+ )
+ self.context.current_step.status = TaskStatus.FAILED
+
+ if self.use_separators:
+ self._add_separator("error")
+
+ def start_performance_tracking(self) -> None:
+ """Start tracking performance metrics."""
+ self._step_start_time = datetime.utcnow()
+
+ def track_step_duration(self, step_type: str, duration: float) -> None:
+ """Track the duration of a specific step type."""
+ if self.context.performance is not None:
+ self.context.performance.add_step_duration(step_type, duration)
+
+ def get_performance_metrics(self) -> Dict[str, Any]:
+ """Get the current performance metrics."""
+ if self.context.performance is not None:
+ return self.context.performance.to_dict()
+ return {"total_duration": 0.0, "step_breakdown": {}}
+
+ def get_context(self) -> Dict[str, Any]:
+ """Get the current context as a dictionary."""
+ return self.context.to_dict()
+
+ def log_state(self) -> None:
+ """Log the current state."""
+ state = self.get_context()
+ print(json.dumps(state, indent=2))
+
+ async def execute_with_retry(
+ self,
+ operation: Callable[[], Awaitable[T]],
+ operation_name: str,
+ retry_config: Optional[RetryConfig] = None
+ ) -> T:
+ """Execute an operation with retry logic."""
+ if retry_config is None:
+ retry_config = RetryConfig()
+
+ attempt = 0
+ last_error = None
+
+ while True:
+ try:
+ # Calculate and apply delay if this is a retry
+ delay = retry_config.get_delay(attempt)
+ if delay == -1: # Max retries exceeded
+ if last_error:
+ raise last_error
+ raise Exception("Max retries exceeded")
+
+ if delay > 0:
+ await asyncio.sleep(delay)
+
+ # Attempt the operation
+ result = await operation()
+
+ # Update retry info on success
+ if self.context.retries is not None:
+ self.context.retries.attempts = attempt + 1
+ self.context.retries.success = True
+
+ return result
+
+ except Exception as e:
+ last_error = e
+ attempt += 1
+
+ # Log the retry attempt
+ if self.context.retries is not None:
+ self.context.retries.history.append({
+ "attempt": attempt,
+ "timestamp": datetime.utcnow().isoformat(),
+ "error": f"{e.__class__.__name__}: {str(e)}",
+ "delay": retry_config.get_delay(attempt)
+ })
+
+ # Update the error context
+ self.log_error(e, self.context.current_step.number, operation_name)
+
+ # Continue if we haven't exceeded max retries
+ if attempt <= retry_config.max_retries:
+ self.update_step(
+ f"Retrying {operation_name} (attempt {attempt + 1}/{retry_config.max_retries + 1})",
+ TaskStatus.RUNNING
+ )
+ continue
+
+ # Max retries exceeded
+ if self.context.retries is not None:
+ self.context.retries.attempts = attempt
+ self.context.retries.success = False
+ raise
\ No newline at end of file
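
A short sketch of how `TaskLogger` and `execute_with_retry` from the new `src/utils/task_logging.py` fit together. `flaky_navigation` is a made-up stand-in for a real browser action, not part of the module:

```python
import asyncio
from src.utils.task_logging import TaskLogger, TaskStatus, ActionType, RetryConfig

async def flaky_navigation() -> str:
    # Stand-in coroutine: fails on the first call, succeeds on the retry.
    if not getattr(flaky_navigation, "called", False):
        flaky_navigation.called = True
        raise TimeoutError("page did not load")
    return "loaded"

async def main() -> None:
    logger = TaskLogger(task_id="demo-001", goal="Open example.com")
    logger.start_phase("navigation")
    logger.update_step("Navigating to example.com", TaskStatus.RUNNING,
                       progress=0.2, action_type=ActionType.NAVIGATION)

    # Exponential backoff: 0.5s, 1.0s, 2.0s between attempts (plus a little jitter).
    result = await logger.execute_with_retry(
        flaky_navigation,
        "navigate",
        RetryConfig(max_retries=3, base_delay=0.5),
    )

    logger.update_step("Navigation finished", TaskStatus.COMPLETE, progress=1.0,
                       action_type=ActionType.NAVIGATION, results={"outcome": result})
    logger.log_state()  # dumps the structured task context as JSON

asyncio.run(main())
```

The retry history, the failed first attempt, and the final success all end up in the context produced by `log_state()`, which is what the CLI surfaces when a task needs recovery.
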
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 3ab38977..9e202fa6 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -9,6 +9,7 @@
import time
from pathlib import Path
from typing import Dict, Optional
+from pydantic import SecretStr
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
@@ -30,15 +31,17 @@ def get_llm_model(provider: str, **kwargs):
base_url = kwargs.get("base_url")
if not kwargs.get("api_key", ""):
- api_key = os.getenv("ANTHROPIC_API_KEY", "")
+ api_key = SecretStr(os.getenv("ANTHROPIC_API_KEY") or "")
else:
- api_key = kwargs.get("api_key")
+ api_key = SecretStr(kwargs.get("api_key") or "")
return ChatAnthropic(
model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key,
+ timeout=kwargs.get("timeout", 60),
+ stop=kwargs.get("stop", None)
)
elif provider == "openai":
if not kwargs.get("base_url", ""):
@@ -47,15 +50,16 @@ def get_llm_model(provider: str, **kwargs):
base_url = kwargs.get("base_url")
if not kwargs.get("api_key", ""):
- api_key = os.getenv("OPENAI_API_KEY", "")
+ api_key = SecretStr(os.getenv("OPENAI_API_KEY") or "")
else:
- api_key = kwargs.get("api_key")
+ api_key = SecretStr(kwargs.get("api_key") or "")
return ChatOpenAI(
- model=kwargs.get("model_name", "gpt-4o"),
+ model=kwargs.get("model_name", "gpt-4"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key,
+ timeout=kwargs.get("timeout", 60),
)
elif provider == "deepseek":
if not kwargs.get("base_url", ""):
@@ -64,25 +68,36 @@ def get_llm_model(provider: str, **kwargs):
base_url = kwargs.get("base_url")
if not kwargs.get("api_key", ""):
- api_key = os.getenv("DEEPSEEK_API_KEY", "")
+ api_key = SecretStr(os.getenv("DEEPSEEK_API_KEY") or "")
else:
- api_key = kwargs.get("api_key")
+ api_key = SecretStr(kwargs.get("api_key") or "")
return ChatOpenAI(
model=kwargs.get("model_name", "deepseek-chat"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key,
+ timeout=kwargs.get("timeout", 60),
)
elif provider == "gemini":
if not kwargs.get("api_key", ""):
- api_key = os.getenv("GOOGLE_API_KEY", "")
+ api_key = SecretStr(os.getenv("GOOGLE_API_KEY") or "")
else:
- api_key = kwargs.get("api_key")
+ api_key = SecretStr(kwargs.get("api_key") or "")
+
+ # Get model name from environment or kwargs
+ model_name = kwargs.get("model_name")
+ if not model_name:
+ if kwargs.get("vision"):
+ model_name = os.getenv("GOOGLE_API_MODEL", "gemini-1.5-flash")
+ else:
+ model_name = os.getenv("GOOGLE_API_TYPE", "gemini-1.5-flash")
+
return ChatGoogleGenerativeAI(
- model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
+ model=model_name,
temperature=kwargs.get("temperature", 0.0),
- google_api_key=api_key,
+ api_key=api_key,
+ timeout=kwargs.get("timeout", 60)
)
elif provider == "ollama":
return ChatOllama(
@@ -97,27 +112,28 @@ def get_llm_model(provider: str, **kwargs):
else:
base_url = kwargs.get("base_url")
if not kwargs.get("api_key", ""):
- api_key = os.getenv("AZURE_OPENAI_API_KEY", "")
+ api_key = SecretStr(os.getenv("AZURE_OPENAI_API_KEY") or "")
else:
- api_key = kwargs.get("api_key")
+ api_key = SecretStr(kwargs.get("api_key") or "")
return AzureChatOpenAI(
model=kwargs.get("model_name", "gpt-4o"),
temperature=kwargs.get("temperature", 0.0),
api_version="2024-05-01-preview",
azure_endpoint=base_url,
api_key=api_key,
+ timeout=kwargs.get("timeout", 60),
)
else:
raise ValueError(f"Unsupported provider: {provider}")
# Predefined model names for common providers
model_names = {
- "anthropic": ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
- "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
+ "anthropic": ["claude-3-5-sonnet-latest", "claude-3-5-sonnet-20241022"],
+ "openai": ["gpt-4o"],
"deepseek": ["deepseek-chat"],
- "gemini": ["gemini-2.0-flash-exp", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest", "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-1219" ],
+ "gemini": ["gemini-1.5-pro", "gemini-2.0-flash"],
"ollama": ["qwen2.5:7b", "llama2:7b"],
- "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"]
+ "azure_openai": ["gpt-4", "gpt-3.5-turbo"]
}
# Callback to update the model name dropdown based on the selected provider
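
With this change, callers pass plain strings (or nothing at all) and `get_llm_model()` handles the `SecretStr` wrapping and the 60-second default timeout itself. A sketch of the DeepSeek path used by the CLI and tests:

```python
import os
from src.utils import utils

# If the api_key kwarg is empty, get_llm_model() falls back to DEEPSEEK_API_KEY
# from the environment; either way the value is wrapped in SecretStr before it
# reaches ChatOpenAI, and timeout defaults to 60 seconds when omitted.
llm = utils.get_llm_model(
    provider="deepseek",
    model_name="deepseek-chat",
    temperature=0.0,
    base_url="https://api.deepseek.com/v1",
    api_key=os.getenv("DEEPSEEK_API_KEY", ""),
    timeout=60,
)
print(type(llm).__name__)  # ChatOpenAI
```

The Gemini branch additionally falls back to `GOOGLE_API_MODEL` (when vision is requested) or `GOOGLE_API_TYPE` (text-only) if no `model_name` is supplied.
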
diff --git a/test_gemini_connection.py b/test_gemini_connection.py
new file mode 100644
index 00000000..0feeecad
--- /dev/null
+++ b/test_gemini_connection.py
@@ -0,0 +1,47 @@
+import google.generativeai as genai
+import os
+from dotenv import load_dotenv, find_dotenv
+
+# Force reload of environment variables
+load_dotenv(find_dotenv(), override=True)
+
+api_key = os.environ.get("GOOGLE_API_KEY")
+model_name = os.environ.get("GOOGLE_API_MODEL")
+
+if not api_key or not model_name:
+ raise ValueError("Missing required environment variables: GOOGLE_API_KEY or GOOGLE_API_MODEL")
+
+print(f"Using model: {model_name}")
+genai.configure(api_key=api_key, transport="rest")
+
+# List all available models
+print("\nAvailable models:")
+for m in genai.list_models():
+ print(f"- {m.name}")
+
+# Check that the model exists in the client
+found_model = False
+for m in genai.list_models():
+ model_id = m.name.replace("models/", "")
+ if model_id == model_name:
+ found_model = True
+ print(f"\nFound model: {m.name}")
+ break
+
+if not found_model:
+ print("\nAvailable model IDs:")
+ for m in genai.list_models():
+ print(f"- {m.name.replace('models/', '')}")
+
+assert found_model, f"Model not found: {model_name}"
+
+# Load the model
+model = genai.GenerativeModel(model_name)
+
+# Perform a simple generation task
+try:
+ response = model.generate_content("Hello, I'm testing the Gemini API connection. Please respond with a short greeting.")
+ print(f"\nResponse: {response.text}")
+except Exception as e:
+ print(f"\nError generating content: {e}")
+ raise
\ No newline at end of file
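
The script above talks to Gemini through `google.generativeai` directly; the sketch below runs the same credentials through the project's `get_llm_model()` path instead. It assumes `GOOGLE_API_KEY` and `GOOGLE_API_MODEL` are already set, as the script requires:

```python
from dotenv import load_dotenv
from src.utils import utils

load_dotenv(override=True)

# vision=True makes the gemini branch read GOOGLE_API_MODEL for the model name.
llm = utils.get_llm_model(provider="gemini", vision=True, temperature=0.0)
reply = llm.invoke("Hello, please respond with a short greeting.")
print(reply.content)
```
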
diff --git a/test_results.txt b/test_results.txt
new file mode 100644
index 00000000..86a61db1
--- /dev/null
+++ b/test_results.txt
@@ -0,0 +1,125 @@
+============================= test session starts ==============================
+platform darwin -- Python 3.11.9, pytest-8.3.4, pluggy-1.5.0 -- /Users/dmieloch/Dev/experiments/web-ui/venv/bin/python
+cachedir: .pytest_cache
+rootdir: /Users/dmieloch/Dev/experiments/web-ui
+configfile: pytest.ini
+plugins: cov-6.0.0, asyncio-0.25.2, anyio-4.8.0, timeout-2.3.1
+asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=function
+collecting ...
+----------------------------- live log collection ------------------------------
+INFO root:service.py:51 Anonymized telemetry enabled. See https://github.com/gregpr07/browser-use for more information.
+INFO httpx:_client.py:1038 HTTP Request: GET https://api.gradio.app/gradio-messaging/en "HTTP/1.1 200 OK"
+collected 133 items
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_basic_initialization
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+PASSED [ 1/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_window_size
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+FAILED [ 2/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_headless_mode
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+FAILED [ 3/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_user_data_dir
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+PASSED [ 4/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_proxy_configuration
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+PASSED [ 5/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_disable_security
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+FAILED [ 6/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserInitialization::test_multiple_initialization
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+FAILED [ 7/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserTasks::test_model_switching
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+FAILED [ 8/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserTasks::test_vision_capability
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+FAILED [ 9/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserTasks::test_recording
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
+-------------------------------- live log call ---------------------------------
+INFO src.agent.custom_agent:custom_agent.py:438 🚀 Starting task: go to example.com
+INFO src.agent.custom_agent:custom_agent.py:222 Starting step 1
+INFO httpx:_client.py:1786 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 422 Unprocessable Entity"
+INFO httpx:_client.py:1038 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
+INFO src.agent.custom_agent:custom_agent.py:139 Model Response: failed
+INFO src.agent.custom_agent:logging.py:96 Batch: 1 action events
+INFO browser_use.controller.service:service.py:59 🔗 Navigated to https://example.com
+INFO src.agent.custom_agent:custom_agent.py:313 Step 2 completed
+INFO src.agent.custom_agent:custom_agent.py:222 Starting step 2
+INFO httpx:_client.py:1786 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 422 Unprocessable Entity"
+INFO httpx:_client.py:1038 HTTP Request: POST https://api.deepseek.com/v1/chat/completions "HTTP/1.1 200 OK"
+INFO src.agent.custom_agent:custom_agent.py:139 Model Response: success
+INFO src.agent.custom_agent:logging.py:96 Batch: 1 action events
+INFO src.agent.custom_agent:custom_agent.py:260 Task completed
+INFO src.agent.custom_agent:custom_agent.py:313 Step 3 completed
+INFO src.agent.custom_agent:custom_agent.py:481 ✅ Task completed successfully
+WARNING src.agent.custom_agent:custom_agent.py:342 No history or first screenshot to create GIF from
+PASSED [ 10/133]
+------------------------------ live log teardown -------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:45 Cleanup finally - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:65 Globals and environment reset after test
+
+tests/test_browser_cli.py::TestBrowserTasks::test_tracing
+-------------------------------- live log setup --------------------------------
+INFO tests.test_browser_cli:test_browser_cli.py:28 Cleanup start - Browser state: False
+INFO tests.test_browser_cli:test_browser_cli.py:39 Globals and environment reset before test
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 00000000..c74cef78
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,7 @@
+"""
+Test suite for the browser-use project.
+
+This package contains tests for:
+- Browser automation (CLI, core functionality, Playwright)
+- API integration (endpoints, LLM integration)
+"""
\ No newline at end of file
diff --git a/tests/requirements-test.txt b/tests/requirements-test.txt
new file mode 100644
index 00000000..bef705f9
--- /dev/null
+++ b/tests/requirements-test.txt
@@ -0,0 +1,3 @@
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+pytest-cov>=4.0.0
\ No newline at end of file
diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 00000000..5dc4fae7
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,73 @@
+import asyncio
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContextConfig, BrowserContextWindowSize
+from browser_use.agent.service import Agent
+from src.utils import utils
+from src.controller.custom_controller import CustomController
+from src.agent.custom_agent import CustomAgent
+from src.agent.custom_prompts import CustomSystemPrompt
+import os
+
+async def main():
+ window_w, window_h = 1920, 1080
+
+ # Initialize the browser
+ browser = Browser(
+ config=BrowserConfig(
+ headless=False,
+ disable_security=True,
+ extra_chromium_args=[f"--window-size={window_w},{window_h}"],
+ )
+ )
+
+ # Create a browser context
+ async with await browser.new_context(
+ config=BrowserContextConfig(
+ trace_path="./tmp/traces",
+ save_recording_path="./tmp/record_videos",
+ no_viewport=False,
+ browser_window_size=BrowserContextWindowSize(
+ width=window_w, height=window_h
+ ),
+ )
+ ) as browser_context:
+ # Initialize the controller
+ controller = CustomController()
+
+ # Initialize the agent with a simple task using CustomAgent
+ agent = CustomAgent(
+ task="go to google.com and search for 'OpenAI'",
+ add_infos="", # hints for the LLM if needed
+ llm=utils.get_llm_model(
+ provider="deepseek",
+ model_name="deepseek-chat", # Using V2.5 via deepseek-chat endpoint
+ temperature=0.8,
+ base_url="https://api.deepseek.com/v1",
+ api_key=os.getenv("DEEPSEEK_API_KEY", "")
+ ),
+ browser=browser,
+ browser_context=browser_context,
+ controller=controller,
+ system_prompt_class=CustomSystemPrompt,
+ use_vision=False, # Must be False for DeepSeek
+ tool_call_in_content=True, # Required for DeepSeek as per test files
+ max_actions_per_step=1 # Control granularity of actions
+ )
+
+ # Run the agent
+ history = await agent.run(max_steps=10)
+
+ print("Final Result:")
+ print(history.final_result())
+
+ print("\nErrors:")
+ print(history.errors())
+
+ print("\nModel Actions:")
+ print(history.model_actions())
+
+ print("\nThoughts:")
+ print(history.model_thoughts())
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/test_browser_cli.py b/tests/test_browser_cli.py
new file mode 100644
index 00000000..7974ea62
--- /dev/null
+++ b/tests/test_browser_cli.py
@@ -0,0 +1,591 @@
+import sys
+from pathlib import Path
+import tempfile
+import logging
+from io import StringIO
+import contextlib
+
+# Add project root to Python path
+PROJECT_ROOT = Path(__file__).parent.parent
+sys.path.insert(0, str(PROJECT_ROOT))
+
+import pytest
+import asyncio
+import os
+from cli.browser_use_cli import initialize_browser, run_browser_task, close_browser, main, _global_browser, _global_browser_context
+from src.utils.utils import model_names # Import model names from utils
+
+# Configure logging for tests
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Reset global state before each test
+@pytest.fixture(autouse=True)
+async def cleanup():
+ """Ensure proper cleanup of browser and event loop between tests"""
+ global _global_browser, _global_browser_context
+
+ logger.info(f"Cleanup start - Browser state: {_global_browser is not None}")
+
+ # Reset globals and environment before test
+ if _global_browser is not None:
+ await close_browser()
+ logger.info("Browser closed")
+
+ _global_browser = None
+ _global_browser_context = None
+ os.environ["BROWSER_USE_RUNNING"] = "false"
+
+ logger.info("Globals and environment reset before test")
+
+ try:
+ yield
+ finally:
+ try:
+ logger.info(f"Cleanup finally - Browser state: {_global_browser is not None}")
+ if _global_browser is not None:
+ await close_browser()
+ logger.info("Browser closed")
+ # Clean up any remaining event loop resources
+            loop = asyncio.get_running_loop()
+ tasks = [t for t in asyncio.all_tasks(loop=loop) if not t.done()]
+ if tasks:
+ logger.info(f"Found {len(tasks)} pending tasks")
+ for task in tasks:
+ task.cancel()
+ await asyncio.gather(*tasks, return_exceptions=True)
+ logger.info("Pending tasks cancelled")
+ except Exception as e:
+ logger.error(f"Error during cleanup: {e}")
+ raise
+ finally:
+ _global_browser = None
+ _global_browser_context = None
+ os.environ["BROWSER_USE_RUNNING"] = "false"
+ logger.info("Globals and environment reset after test")
+
+class TestBrowserInitialization:
+ """Test browser launch-time options"""
+
+ async def test_basic_initialization(self):
+ """Test basic browser initialization with defaults"""
+ success = await initialize_browser()
+ assert success is True
+
+ async def test_window_size(self):
+ """Test custom window size"""
+ success = await initialize_browser(window_size=(800, 600))
+ assert success is True
+
+ # Create a simple HTML page that displays window size
+ result = await run_browser_task(
+ "go to data:text/html,",
+ model="deepseek-chat"
+ )
+ assert result is not None and "800" in result.lower() and "600" in result.lower()
+
+ async def test_headless_mode(self):
+ """Test headless mode"""
+ success = await initialize_browser(headless=True)
+ assert success is True
+ # Verify we can still run tasks
+ result = await run_browser_task(
+ "go to example.com and tell me the title",
+ model="deepseek-chat"
+ )
+ assert result is not None and "example" in result.lower()
+
+ async def test_user_data_dir(self, tmp_path):
+ """Test custom user data directory"""
+ user_data = tmp_path / "chrome_data"
+ user_data.mkdir()
+ success = await initialize_browser(user_data_dir=str(user_data))
+ assert success is True
+ assert user_data.exists()
+
+ async def test_proxy_configuration(self):
+ """Test proxy configuration"""
+ # Using a test proxy - in practice you'd use a real proxy server
+ test_proxy = "localhost:8080"
+ success = await initialize_browser(proxy=test_proxy)
+ assert success is True
+
+ @pytest.mark.timeout(30) # Add 30 second timeout
+ async def test_disable_security(self):
+ """Test security disable option"""
+ success = await initialize_browser(disable_security=True)
+ assert success is True
+ # Try accessing a cross-origin resource that would normally be blocked
+ result = await run_browser_task(
+ "go to a test page and try to access cross-origin content",
+ model="deepseek-chat",
+ max_steps=5 # Limit steps to prevent timeout
+ )
+ assert result is not None and "error" not in result.lower()
+
+ async def test_multiple_initialization(self):
+ """Test that second initialization fails while browser is running"""
+ success1 = await initialize_browser()
+ assert success1 is True
+ success2 = await initialize_browser()
+ assert success2 is False
+
+class TestBrowserTasks:
+ """Test runtime task options"""
+
+ @pytest.fixture(autouse=True)
+ async def setup_browser(self):
+ """Start browser before each test"""
+ await initialize_browser()
+ yield
+
+ @pytest.fixture
+ def local_test_page(self):
+ """Create a local HTML file for testing"""
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
+ f.write("""
+
+
+
+                <p>This is a test paragraph with specific content.</p>
+ + + + + """) + return f.name + + async def test_model_switching(self): + """Test switching between different LLM models""" + # Test DeepSeek - Note: 422 errors are expected but don't affect functionality + try: + result1 = await run_browser_task( + "go to example.com and summarize the page", + model="deepseek-chat" + ) + assert result1 is not None + except Exception as e: + if "422" not in str(e): # Only ignore 422 errors + raise + + # Test Gemini + os.environ["GOOGLE_API_MODEL"] = model_names["gemini"][0] # Set model via environment + result2 = await run_browser_task( + "what do you see on the page?", + model="gemini", + vision=True + ) + assert result2 is not None and len(result2) > 0 + assert result1 is not None and len(result1) > 0 + assert result1 != result2 # Different models should give different responses + + async def test_vision_capability(self): + """Test vision capabilities""" + # Set Gemini model via environment + os.environ["GOOGLE_API_MODEL"] = model_names["gemini"][0] + + # Without vision + result1 = await run_browser_task( + "what do you see on example.com?", + model="gemini", + vision=False + ) + + # With vision + result2 = await run_browser_task( + "what do you see on example.com?", + model="gemini", + vision=True + ) + + assert result1 is not None and result2 is not None and len(result2) > len(result1) # Vision should provide more details + + async def test_recording(self, tmp_path): + """Test session recording""" + record_path = tmp_path / "recordings" + record_path.mkdir() + + await run_browser_task( + "go to example.com", + record=True, + record_path=str(record_path) + ) + + # Check that recording file was created + recordings = list(record_path.glob("*.webm")) + assert len(recordings) > 0 + + async def test_tracing(self, tmp_path): + """Test debug tracing""" + trace_path = tmp_path / "traces" + trace_path.mkdir() + + await run_browser_task( + "go to example.com", + trace_path=str(trace_path) + ) + + # Wait a bit for the trace file to be written + await asyncio.sleep(1) + + # Check that trace file was created + traces = list(trace_path.glob("*.zip")) + assert len(traces) > 0 + + async def test_max_steps_limit(self): + """Test max steps limitation""" + with pytest.raises(Exception): + # This task would normally take more than 2 steps + await run_browser_task( + "go to google.com, search for 'OpenAI', click first result", + max_steps=2 + ) + + async def test_max_actions_limit(self): + """Test max actions per step limitation""" + with pytest.raises(Exception): + # This would require multiple actions in one step + await run_browser_task( + "go to google.com and click all search results", + max_actions=1 + ) + + async def test_additional_context(self): + """Test providing additional context""" + result = await run_browser_task( + "summarize the content", + add_info="Focus on technical details and pricing information" + ) + assert result is not None and ("technical" in result.lower() or "pricing" in result.lower()) + + async def test_report_generation(self, local_test_page): + """Test that the agent can analyze a page and return a report""" + logger.info("Starting report generation test") + + # Check initial state + logger.info(f"Initial browser state: {_global_browser is not None}") + + # Initialize browser + success = await initialize_browser() + logger.info(f"Browser initialization result: {success}") + + assert success is True, "Browser initialization failed" + + # Create the task prompt + prompt = f"Go to file://{local_test_page} and create a report about the page 
structure, including any interactive elements found" + + try: + result = await run_browser_task( + prompt=prompt, + model="deepseek-chat", + max_steps=3 + ) + + logger.info(f"Received report: {result}") + + # Verify the report contains expected information + assert result is not None + assert "Test Content" in result + assert "button" in result.lower() + assert "paragraph" in result.lower() + + logger.info("Report verification successful") + + except Exception as e: + logger.error(f"Error during report generation: {e}") + raise + finally: + # Cleanup + os.unlink(local_test_page) + logger.info("Test cleanup completed") + +class TestBrowserLifecycle: + """Test browser lifecycle management""" + + async def test_close_and_reopen(self): + """Test closing and reopening browser""" + # First session + success1 = await initialize_browser() + assert success1 is True + result1 = await run_browser_task("go to example.com") + assert result1 is not None + await close_browser() + + # Second session + success2 = await initialize_browser() + assert success2 is True + result2 = await run_browser_task("go to example.com") + assert result2 is not None + + async def test_error_handling(self): + """Test error handling in various scenarios""" + # Test running task without browser + with pytest.raises(Exception): + await run_browser_task("this should fail") + + # Test closing already closed browser + await close_browser() + await close_browser() # Should not raise error + + # Test recovery after error + success = await initialize_browser() + assert success is True + result = await run_browser_task("go to example.com") + assert result is not None + +class TestCLICommands: + """Comprehensive tests for CLI command functionality""" + + @pytest.fixture(autouse=True) + def setup_cli(self): + """Setup and cleanup for CLI tests""" + # Store original argv and stdout + self.original_argv = sys.argv.copy() + self.original_stdout = sys.stdout + + # Create StringIO buffer and redirect stdout + self.output = StringIO() + sys.stdout = self.output + + yield + + # Restore original argv and stdout + sys.argv = self.original_argv + sys.stdout = self.original_stdout + + # Close the StringIO buffer + self.output.close() + + def test_start_command_basic(self): + """Test basic browser start command""" + # Ensure output buffer is empty + self.output.truncate(0) + self.output.seek(0) + + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + output = self.output.getvalue() + assert "Browser session started successfully" in output + + def test_start_command_with_options(self): + """Test browser start with various options""" + # Ensure output buffer is empty + self.output.truncate(0) + self.output.seek(0) + + sys.argv = [ + "browser-use", "start", + "--window-size", "800x600", + "--headless", + "--disable-security" + ] + with contextlib.redirect_stdout(self.output): + main() + output = self.output.getvalue() + assert "Browser session started successfully" in output + + def test_run_command_basic(self): + """Test basic run command""" + # First start the browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + + # Then run a task + self.output.truncate(0) + self.output.seek(0) + sys.argv = [ + "browser-use", "run", + "go to example.com", + "--model", "deepseek-chat" + ] + with contextlib.redirect_stdout(self.output): + main() + output = self.output.getvalue() + assert len(output) > 0 + + def 
test_run_command_with_options(self): + """Test run command with various options""" + # First start the browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + + # Then run a task with multiple options + self.output.truncate(0) + self.output.seek(0) + sys.argv = [ + "browser-use", "run", + "go to example.com", + "--model", "gemini", + "--vision", + "--max-steps", "5", + "--max-actions", "2", + "--add-info", "Focus on the main content" + ] + with contextlib.redirect_stdout(self.output): + main() + output = self.output.getvalue() + assert len(output) > 0 + + def test_close_command(self): + """Test browser close command""" + # First start the browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + + # Then close it + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "close"] + with contextlib.redirect_stdout(self.output): + main() + output = self.output.getvalue() + assert "Browser session closed" in output + + def test_invalid_command(self): + """Test handling of invalid commands""" + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "invalid-command"] + with pytest.raises(SystemExit): + with contextlib.redirect_stdout(self.output): + main() + + def test_missing_required_args(self): + """Test handling of missing required arguments""" + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "run"] # Missing prompt + with pytest.raises(SystemExit): + with contextlib.redirect_stdout(self.output): + main() + + def test_invalid_window_size(self): + """Test handling of invalid window size format""" + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start", "--window-size", "invalid"] + with contextlib.redirect_stdout(self.output): + main() # Should use default size + output = self.output.getvalue() + assert "Browser session started successfully" in output + + def test_recording_options(self): + """Test recording functionality via CLI""" + with tempfile.TemporaryDirectory() as tmp_dir: + # First start the browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + + # Then run with recording + self.output.truncate(0) + self.output.seek(0) + sys.argv = [ + "browser-use", "run", + "go to example.com", + "--record", + "--record-path", tmp_dir + ] + with contextlib.redirect_stdout(self.output): + main() + recordings = list(Path(tmp_dir).glob("*.webm")) + assert len(recordings) > 0 + + def test_tracing_options(self): + """Test tracing functionality via CLI""" + with tempfile.TemporaryDirectory() as tmp_dir: + # First start the browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + + # Then run with tracing + self.output.truncate(0) + self.output.seek(0) + sys.argv = [ + "browser-use", "run", + "go to example.com", + "--trace-path", tmp_dir + ] + with contextlib.redirect_stdout(self.output): + main() + traces = list(Path(tmp_dir).glob("*.zip")) + assert len(traces) > 0 + + def test_model_switching_cli(self): + """Test switching between different models via CLI""" + # First start the browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + 
+ # Test with DeepSeek + self.output.truncate(0) + self.output.seek(0) + sys.argv = [ + "browser-use", "run", + "go to example.com", + "--model", "deepseek-chat" + ] + with contextlib.redirect_stdout(self.output): + main() + deepseek_output = self.output.getvalue() + + # Close browser to clean up event loop + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "close"] + with contextlib.redirect_stdout(self.output): + main() + + # Start new browser for Gemini test + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "start"] + with contextlib.redirect_stdout(self.output): + main() + + # Test with Gemini + self.output.truncate(0) + self.output.seek(0) + os.environ["GOOGLE_API_MODEL"] = model_names["gemini"][0] + sys.argv = [ + "browser-use", "run", + "go to example.com", + "--model", "gemini", + "--vision" + ] + with contextlib.redirect_stdout(self.output): + main() + gemini_output = self.output.getvalue() + + # Close browser + self.output.truncate(0) + self.output.seek(0) + sys.argv = ["browser-use", "close"] + with contextlib.redirect_stdout(self.output): + main() + + assert len(deepseek_output) > 0 + assert len(gemini_output) > 0 + assert deepseek_output != gemini_output \ No newline at end of file diff --git a/tests/test_browser_controller.py b/tests/test_browser_controller.py new file mode 100644 index 00000000..409d8d33 --- /dev/null +++ b/tests/test_browser_controller.py @@ -0,0 +1,125 @@ +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +import asyncio +from src.utils.browser_controller import BrowserController + +@pytest.fixture +async def browser_controller(): + controller = BrowserController() + yield controller + await controller.cleanup() + +@pytest.mark.asyncio +async def test_single_initialization(browser_controller): + mock_browser = AsyncMock() + mock_playwright = AsyncMock() + mock_playwright.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch('src.utils.browser_controller.async_playwright', + return_value=AsyncMock(start=AsyncMock(return_value=mock_playwright))) as mock_async_playwright: + await browser_controller.initialize() + assert browser_controller.init_count == 1 + assert browser_controller.browser == mock_browser + + # Verify progress events + progress_history = browser_controller.logger.get_progress_history() + assert len(progress_history) >= 2 # At least start and complete events + assert progress_history[0]["status"] == "starting" + assert progress_history[-1]["status"] == "completed" + assert progress_history[-1]["progress"] == 1.0 + + # Second initialization should not create new browser + await browser_controller.initialize() + assert browser_controller.init_count == 1 + mock_async_playwright.assert_called_once() + +@pytest.mark.asyncio +async def test_concurrent_initialization(browser_controller): + mock_browser = AsyncMock() + mock_playwright = AsyncMock() + mock_playwright.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch('src.utils.browser_controller.async_playwright', + return_value=AsyncMock(start=AsyncMock(return_value=mock_playwright))): + # Start multiple concurrent initializations + tasks = [browser_controller.initialize() for _ in range(3)] + await asyncio.gather(*tasks) + + # Should only initialize once + assert browser_controller.init_count == 1 + assert browser_controller.browser == mock_browser + + # Verify browser events + browser_events = browser_controller.logger.get_browser_events() + launch_events = [e for e in browser_events if 
e["event_type"] == "browser_launched"] + assert len(launch_events) == 1 + +@pytest.mark.asyncio +async def test_browser_launch_options(browser_controller): + mock_browser = AsyncMock() + mock_playwright = AsyncMock() + mock_playwright.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch('src.utils.browser_controller.async_playwright', + return_value=AsyncMock(start=AsyncMock(return_value=mock_playwright))) as mock_async_playwright: + await browser_controller.initialize() + + # Verify launch options + mock_playwright.chromium.launch.assert_called_once_with( + headless=True, + args=['--no-sandbox'] + ) + + # Verify browser events + browser_events = browser_controller.logger.get_browser_events() + launch_event = next(e for e in browser_events if e["event_type"] == "browser_launched") + assert launch_event["details"]["headless"] is True + +@pytest.mark.asyncio +async def test_initialization_failure(browser_controller): + mock_playwright = AsyncMock() + mock_playwright.chromium.launch = AsyncMock(side_effect=Exception("Browser launch failed")) + + with patch('src.utils.browser_controller.async_playwright', + return_value=AsyncMock(start=AsyncMock(return_value=mock_playwright))), \ + pytest.raises(Exception, match="Browser launch failed"): + await browser_controller.initialize() + + assert browser_controller.browser is None + assert browser_controller.init_count == 0 + + # Verify error events + browser_events = browser_controller.logger.get_browser_events() + error_event = next(e for e in browser_events if e["event_type"] == "launch_failed") + assert "Browser launch failed" in error_event["details"]["error"] + + # Verify progress events show failure + progress_events = browser_controller.logger.get_progress_history() + final_event = progress_events[-1] + assert final_event["status"] == "failed" + assert final_event["progress"] == 0.0 + +@pytest.mark.asyncio +async def test_browser_cleanup(browser_controller): + mock_browser = AsyncMock() + mock_playwright = AsyncMock() + mock_playwright.chromium.launch = AsyncMock(return_value=mock_browser) + + with patch('src.utils.browser_controller.async_playwright', + return_value=AsyncMock(start=AsyncMock(return_value=mock_playwright))): + await browser_controller.initialize() + assert browser_controller.browser is not None + + await browser_controller.cleanup() + mock_browser.close.assert_called_once() + mock_playwright.stop.assert_called_once() + assert browser_controller.browser is None + assert browser_controller._playwright is None + + # Verify cleanup events + progress_events = browser_controller.logger.get_progress_history() + cleanup_events = [e for e in progress_events if e["step"] == "cleanup"] + assert len(cleanup_events) >= 2 # At least start and complete events + assert cleanup_events[0]["status"] == "starting" + assert cleanup_events[-1]["status"] == "completed" + assert cleanup_events[-1]["progress"] == 1.0 \ No newline at end of file diff --git a/tests/test_browser_use_cli.py b/tests/test_browser_use_cli.py new file mode 100644 index 00000000..1506d019 --- /dev/null +++ b/tests/test_browser_use_cli.py @@ -0,0 +1,117 @@ +import pytest +import asyncio +from pathlib import Path +from urllib.parse import urlparse +from cli.browser_use_cli import run_browser_task, initialize_browser, close_browser + +@pytest.fixture +async def browser_session(): + """Fixture to manage browser session for tests""" + await initialize_browser(headless=True) + yield + await close_browser() + +@pytest.mark.asyncio +async def test_url_validation(): + 
"""Test URL validation in run_browser_task""" + # Test invalid URLs + invalid_urls = [ + "not-a-url", + "http://", + "https://", + "ftp://example.com", # non-http(s) protocol + "", + None + ] + + for url in invalid_urls: + result = await run_browser_task( + prompt="test task", + url=url, + provider="Deepseek", + headless=True + ) + assert "Invalid URL provided" in result + + # Test valid URLs + valid_urls = [ + "https://example.com", + "http://localhost:8080", + "https://prompt-forge.replit.app/" + ] + + for url in valid_urls: + result = await run_browser_task( + prompt="test task", + url=url, + provider="Deepseek", + headless=True + ) + assert "Invalid URL provided" not in result + +@pytest.mark.asyncio +async def test_url_navigation(browser_session): + """Test that the browser actually navigates to the provided URL""" + url = "https://example.com" + result = await run_browser_task( + prompt="verify the page title contains 'Example'", + url=url, + provider="Deepseek", + headless=True, + max_steps=3 + ) + assert "success" in result.lower() or "verified" in result.lower() + +@pytest.mark.asyncio +async def test_url_in_prompt(): + """Test that the URL is correctly prepended to the task prompt""" + url = "https://example.com" + test_prompt = "click the button" + result = await run_browser_task( + prompt=test_prompt, + url=url, + provider="Deepseek", + headless=True + ) + + # The result should indicate navigation happened first + assert "navigated" in result.lower() or "loaded" in result.lower() + +@pytest.mark.asyncio +async def test_multiple_tasks_same_url(browser_session): + """Test running multiple tasks with the same starting URL""" + url = "https://example.com" + tasks = [ + "verify the page has loaded", + "check if there are any links on the page", + "look for a search box" + ] + + for task in tasks: + result = await run_browser_task( + prompt=task, + url=url, + provider="Deepseek", + headless=True, + max_steps=3 + ) + assert result is not None + assert isinstance(result, str) + +@pytest.mark.asyncio +async def test_url_with_different_providers(): + """Test URL handling with different providers""" + url = "https://example.com" + providers = ["Deepseek", "Google", "Anthropic"] + + for provider in providers: + result = await run_browser_task( + prompt="verify the page has loaded", + url=url, + provider=provider, + headless=True, + max_steps=3 + ) + assert result is not None + assert isinstance(result, str) + assert "Invalid URL provided" not in result \ No newline at end of file diff --git a/tests/test_browser_vision.py b/tests/test_browser_vision.py new file mode 100644 index 00000000..75b44c32 --- /dev/null +++ b/tests/test_browser_vision.py @@ -0,0 +1,94 @@ +import os +import pytest +from dotenv import load_dotenv +from src.utils import utils +from cli.browser_use_cli import run_browser_task + +# Load environment variables +load_dotenv() + +@pytest.mark.asyncio +class TestBrowserVision: + """Test browser automation with vision capabilities""" + + async def setup_method(self): + """Setup test environment""" + self.api_key = os.getenv("OPENAI_API_KEY") + if not self.api_key: + pytest.skip("OPENAI_API_KEY not set") + + async def test_vision_analysis_task(self): + """Test visual analysis of a webpage""" + result = await run_browser_task( + prompt="go to https://example.com and describe the visual layout of the page", + provider="OpenAI", + vision=True, + headless=True, # Run headless for CI/CD + record=True, # Record for debugging + record_path="./tmp/test_recordings" + ) + assert result 
is not None + assert "layout" in result.lower() or "design" in result.lower() + + async def test_vision_interaction_task(self): + """Test visual-guided interaction""" + result = await run_browser_task( + prompt="go to https://example.com and click on the most prominent link on the page", + provider="OpenAI", + vision=True, + headless=True, + record=True, + record_path="./tmp/test_recordings" + ) + assert result is not None + assert "clicked" in result.lower() or "selected" in result.lower() + + async def test_vision_verification_task(self): + """Test visual verification of page state""" + result = await run_browser_task( + prompt="go to https://example.com and verify that the main heading is visible and centered", + provider="OpenAI", + vision=True, + headless=True, + record=True, + record_path="./tmp/test_recordings" + ) + assert result is not None + assert "heading" in result.lower() and ("visible" in result.lower() or "centered" in result.lower()) + + async def test_vision_error_handling(self): + """Test error handling with vision tasks""" + # Test with a non-existent page to verify error handling + result = await run_browser_task( + prompt="go to https://nonexistent.example.com and describe what you see", + provider="OpenAI", + vision=True, + headless=True, + record=True, + record_path="./tmp/test_recordings" + ) + assert result is not None + assert "error" in result.lower() or "unable" in result.lower() or "failed" in result.lower() + + async def test_vision_with_different_models(self): + """Test vision capabilities with different providers""" + test_configs = [ + "OpenAI", # Will use gpt-4o + "Google", # Will use gemini-pro + "Anthropic" # Will use claude-3-5-sonnet-20241022 + ] + + for provider in test_configs: + result = await run_browser_task( + prompt="go to https://example.com and describe the page layout", + provider=provider, + vision=True, + headless=True, + record=True, + record_path=f"./tmp/test_recordings/{provider.lower()}" + ) + assert result is not None + assert len(result) > 0, f"Failed with provider {provider}" + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/test_enhanced_tracing.py b/tests/test_enhanced_tracing.py new file mode 100644 index 00000000..74b4d344 --- /dev/null +++ b/tests/test_enhanced_tracing.py @@ -0,0 +1,894 @@ +import pytest +import asyncio +import json +import zipfile +from pathlib import Path +import tempfile +from src.trace_analyzer import PlaywrightTrace, analyze_trace, EnhancedTraceAnalyzer + +# Sample enhanced trace data with new features +SAMPLE_ENHANCED_TRACE = { + "action_context": { + "before_state": { + "element": "#login-button", + "visible": True, + "enabled": True, + "text": "Log In" + }, + "after_state": { + "element": "#login-button", + "visible": True, + "enabled": True, + "clicked": True + }, + "interactive_elements": [ + { + "selector": "#login-button", + "confidence": 0.95, + "chosen": True, + "reason": "Primary login button with highest visibility" + }, + { + "selector": "#signup-button", + "confidence": 0.45, + "chosen": False, + "reason": "Not relevant for login action" + } + ], + "element_state_before": { + "visible": True, + "computed_styles": { + "pointer-events": "auto", + "opacity": "1", + "z-index": "100" + }, + "focus_state": "not-focused", + "accessibility": { + "aria-hidden": "false", + "aria-disabled": "false" + } + }, + "element_state_after": { + "visible": True, + "focus_state": "focused", + "triggered_events": ["click", "focus"], + "accessibility": { + 
"aria-hidden": "false", + "aria-disabled": "false" + } + } + }, + "decision_trail": { + "reasoning": [ + "Identified login form as primary authentication method", + "Located login button with high confidence", + "Verified button is enabled and visible" + ], + "alternatives": [ + { + "action": "click signup button", + "rejected_reason": "Not aligned with login task" + } + ], + "influential_features": ["button text", "aria-label", "position"], + "confidence_threshold": 0.8, + "attention_weights": { + "element_text": 0.6, + "aria_label": 0.3, + "position": 0.1 + }, + "alternative_paths": [ + { + "action": "click hamburger menu", + "rejected_reason": "settings directly visible", + "confidence": 0.4 + } + ] + }, + "element_identification": { + "selectors": { + "xpath": "//button[@id='login-button']", + "css": "#login-button", + "aria": "button[aria-label='Login']", + "text": "button:has-text('Log In')" + }, + "visual_position": { + "x": 100, + "y": 200, + "width": 80, + "height": 40 + }, + "relationships": { + "parent": "form#login-form", + "siblings": ["#username-input", "#password-input"] + }, + "relative_position": { + "from_top_nav": "20px from right", + "from_viewport": "top-right quadrant", + "nearest_landmarks": [ + {"element": "button.new-template", "distance": "40px left"}, + {"element": "div.user-menu", "distance": "60px right"} + ] + }, + "hierarchy": { + "parent": "nav.top-bar", + "siblings": ["button.new-template", "button.help"], + "children": ["span.icon", "span.text"] + } + }, + "failure_analysis": { + "state": "Element found but not clickable", + "attempts": [ + { + "strategy": "wait for visibility", + "outcome": "success", + "duration": 500 + } + ], + "dom_changes": [ + { + "timestamp": 1000, + "change": "overlay-removed" + } + ], + "dom_mutations": [ + { + "timestamp": "T+200ms", + "type": "attribute_change", + "element": "#settings-modal", + "attribute": "aria-hidden", + "old_value": "true", + "new_value": "false" + } + ], + "network_state": { + "requests_in_flight": 2, + "last_completed_request": "/api/settings", + "pending_requests": [ + { + "url": "/api/user/preferences", + "method": "GET", + "duration_so_far": "150ms" + } + ] + } + }, + "session_context": { + "url": "https://example.com/login", + "route_changes": [ + { + "from": "/", + "to": "/login", + "timestamp": 900 + } + ], + "network_requests": [ + { + "url": "/api/auth", + "method": "POST", + "status": 200 + } + ], + "viewport": { + "width": 1920, + "height": 1080, + "device_pixel_ratio": 2, + "orientation": "landscape" + }, + "performance_metrics": { + "memory_usage": "120MB", + "dom_node_count": 1250, + "frame_rate": "60fps", + "resource_timing": { + "dns_lookup": "10ms", + "connection": "50ms", + "ttfb": "200ms" + } + }, + "browser_state": { + "cookies_enabled": True, + "javascript_enabled": True, + "local_storage_used": "2.5MB", + "active_service_workers": 2 + } + }, + "recovery_info": { + "checkpoints": [ + { + "state": "pre-login", + "timestamp": 800, + "restorable": True + } + ], + "alternative_selectors": [ + "#login-button", + "button[aria-label='Login']" + ], + "state_restoration": { + "checkpoints": [ + { + "timestamp": "T+0", + "state": "initial_load", + "restorable": True, + "snapshot": { + "url": "https://example.com/login", + "scroll_position": {"x": 0, "y": 0}, + "form_data": {"username": "test", "password": "****"} + } + }, + { + "timestamp": "T+1500ms", + "state": "settings_clicked", + "restorable": True, + "snapshot": { + "url": "https://example.com/settings", + "modal_open": True, + 
"selected_tab": "general" + } + } + ] + }, + "fallback_sequences": [ + { + "condition": "settings_button_not_visible", + "actions": [ + { + "step": "check_viewport_scroll", + "max_attempts": 3, + "delay_between_attempts": "500ms" + }, + { + "step": "check_hamburger_menu", + "required_elements": ["button.menu", "div.dropdown"] + }, + { + "step": "refresh_page", + "clear_cache": True + } + ], + "success_criteria": { + "element_visible": True, + "element_clickable": True, + "no_overlays": True + } + } + ] + }, + "model_data": { + "input_tokens": 512, + "output_tokens": 128, + "vision_analysis": { + "button_detected": True, + "confidence": 0.98 + } + }, + "temporal_context": { + "action_start": 1000, + "action_complete": 1500, + "wait_conditions": [ + { + "type": "animation", + "duration": 200 + } + ] + }, + "element_reporting": { + "current_step": { + "number": 3, + "description": "Locating settings button", + "context": "Looking for interactive element with icon or label", + "viewport_state": "Fully loaded, no overlays" + }, + "element_selection": { + "chosen_element": { + "selector": "button.settings-icon", + "confidence": 0.95, + "action": "click", + "description": "Settings button in top-right corner" + }, + "alternative_candidates": [ + { + "selector": "div.menu-icon", + "confidence": 0.45, + "rejected_reason": "Not interactive element" + }, + { + "selector": "span.gear-icon", + "confidence": 0.30, + "rejected_reason": "Hidden by overlay" + } + ], + "selection_criteria": [ + "Visibility in viewport", + "Interactive element", + "Icon matching settings/gear pattern" + ] + } + }, + "error_context": { + "session_state": { + "status": "reset_required", + "reason": "No active session found", + "action": "Creating new session with fresh context", + "resolution": "Reinitialize successful" + }, + "recovery_steps": [ + { + "attempt": 1, + "strategy": "clear_session", + "outcome": "success" + }, + { + "attempt": 2, + "strategy": "reinitialize", + "outcome": "success" + } + ] + }, + "timing_analysis": { + "action_breakdown": { + "element_search": "150ms", + "interaction_delay": "50ms", + "animation_duration": "200ms", + "network_wait": "300ms" + }, + "cumulative_timing": { + "total_duration": "700ms", + "user_perceived_latency": "250ms" + }, + "performance_markers": { + "first_paint": "100ms", + "first_contentful_paint": "200ms", + "time_to_interactive": "450ms" + } + }, + "visual_state": { + "screenshot_diffs": { + "before_click": "diff_1.png", + "after_click": "diff_2.png", + "changes_highlighted": True + }, + "element_visibility": { + "before": { + "visible_area_percentage": 100, + "obscured_by": [], + "viewport_position": "center" + }, + "after": { + "visible_area_percentage": 100, + "obscured_by": [], + "viewport_position": "center" + } + }, + "layout_shifts": [ + { + "timestamp": "T+100ms", + "elements_moved": ["#settings-panel", "#main-content"], + "cumulative_layout_shift": 0.1 + } + ] + }, + "error_recovery": { + "retry_strategy": { + "backoff": "exponential", + "max_attempts": 3, + "conditions": { + "network_stable": True, + "animations_complete": True, + "viewport_stable": True + } + }, + "environment_factors": { + "network_conditions": { + "latency": "50ms", + "bandwidth": "10Mbps", + "stability": "stable" + }, + "system_resources": { + "cpu_utilization": "45%", + "memory_available": "2GB", + "gpu_utilization": "30%" + } + }, + "recovery_checkpoints": [ + { + "timestamp": "T+0", + "state": "pre_action", + "snapshot": { + "dom_state": "hash1234", + "scroll_position": {"x": 0, "y": 0} + } 
+ }, + { + "timestamp": "T+500ms", + "state": "post_action", + "snapshot": { + "dom_state": "hash5678", + "scroll_position": {"x": 0, "y": 100} + } + } + ] + } +} + +@pytest.fixture +def enhanced_trace_file(): + """Create a temporary trace file with enhanced sample data.""" + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip: + with zipfile.ZipFile(temp_zip.name, 'w') as zf: + zf.writestr('trace.enhanced', json.dumps(SAMPLE_ENHANCED_TRACE)) + yield temp_zip.name + Path(temp_zip.name).unlink() + +@pytest.mark.asyncio +async def test_action_context_analysis(enhanced_trace_file): + """Test analysis of action context including before/after states.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + context = await analyzer.analyze_action_context() + + assert context["interactive_elements_count"] == 2 + assert context["chosen_element"]["confidence"] > 0.9 + assert len(context["state_changes"]) > 0 + assert "clicked" in context["state_changes"][0]["after"] + +@pytest.mark.asyncio +async def test_decision_trail_analysis(enhanced_trace_file): + """Test analysis of decision making process.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + trail = await analyzer.analyze_decision_trail() + + assert len(trail["reasoning_steps"]) == 3 + assert len(trail["alternative_actions"]) > 0 + assert len(trail["key_features"]) > 0 + +@pytest.mark.asyncio +async def test_element_identification_analysis(enhanced_trace_file): + """Test analysis of element identification methods.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + identification = await analyzer.analyze_element_identification() + + assert len(identification["selectors"]) >= 4 + assert "visual_position" in identification + assert "element_relationships" in identification + +@pytest.mark.asyncio +async def test_failure_analysis(enhanced_trace_file): + """Test analysis of failure scenarios and recovery attempts.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + failure = await analyzer.analyze_failures() + + assert "failure_state" in failure + assert len(failure["recovery_attempts"]) > 0 + assert "dom_mutations" in failure + +@pytest.mark.asyncio +async def test_session_context_analysis(enhanced_trace_file): + """Test analysis of session-wide context.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + session = await analyzer.analyze_session_context() + + assert "current_url" in session + assert len(session["route_history"]) > 0 + assert len(session["network_activity"]) > 0 + +@pytest.mark.asyncio +async def test_recovery_info_analysis(enhanced_trace_file): + """Test analysis of recovery information.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + recovery = await analyzer.analyze_recovery_info() + + assert len(recovery["restore_points"]) > 0 + assert len(recovery["fallback_selectors"]) > 0 + +@pytest.mark.asyncio +async def test_model_data_analysis(enhanced_trace_file): + """Test analysis of model-specific data.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + model_data = await analyzer.analyze_model_data() + + assert "token_usage" in model_data + assert "vision_results" in model_data + assert model_data["token_usage"]["total"] == model_data["token_usage"]["input"] + model_data["token_usage"]["output"] + +@pytest.mark.asyncio +async def test_temporal_context_analysis(enhanced_trace_file): + """Test analysis of temporal information.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + temporal = await analyzer.analyze_temporal_context() + + assert 
"duration" in temporal + assert len(temporal["wait_events"]) > 0 + assert temporal["duration"] == temporal["end_time"] - temporal["start_time"] + +@pytest.mark.asyncio +async def test_comprehensive_trace_analysis(enhanced_trace_file): + """Test end-to-end analysis of enhanced trace data.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + result = await analyzer.analyze_all() + + # Verify all major components are present + assert "action_context" in result + assert "decision_trail" in result + assert "element_identification" in result + assert "failure_analysis" in result + assert "session_context" in result + assert "recovery_info" in result + assert "model_data" in result + assert "temporal_context" in result + + # Verify relationships between components + assert result["action_context"]["timestamp"] <= result["temporal_context"]["end_time"] + + # Debug prints + print("\nFallback selectors:", result["recovery_info"]["fallback_selectors"]) + print("Element selectors:", result["element_identification"]["selectors"].values()) + + # Verify that at least one selector is in the fallback selectors + assert any(selector in result["recovery_info"]["fallback_selectors"] + for selector in result["element_identification"]["selectors"].values()) + +@pytest.mark.asyncio +async def test_enhanced_element_reporting(enhanced_trace_file): + """Test enhanced element reporting with detailed selection context.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + reporting = await analyzer.analyze_element_reporting() + + # Verify step context + assert reporting["current_step"]["number"] == 3 + assert "description" in reporting["current_step"] + assert "context" in reporting["current_step"] + assert "viewport_state" in reporting["current_step"] + + # Verify element selection details + selection = reporting["element_selection"] + assert selection["chosen_element"]["confidence"] > 0.9 + assert len(selection["alternative_candidates"]) >= 2 + assert len(selection["selection_criteria"]) >= 3 + + # Verify detailed element information + chosen = selection["chosen_element"] + assert "selector" in chosen + assert "description" in chosen + assert "action" in chosen + +@pytest.mark.asyncio +async def test_enhanced_error_context(enhanced_trace_file): + """Test enhanced error context and session state reporting.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + error_context = await analyzer.analyze_error_context() + + # Verify session state information + assert "status" in error_context["session_state"] + assert "reason" in error_context["session_state"] + assert "action" in error_context["session_state"] + assert "resolution" in error_context["session_state"] + + # Verify recovery steps + assert len(error_context["recovery_steps"]) >= 2 + for step in error_context["recovery_steps"]: + assert "attempt" in step + assert "strategy" in step + assert "outcome" in step + +@pytest.mark.asyncio +async def test_comprehensive_analysis_with_enhancements(enhanced_trace_file): + """Test comprehensive analysis including new enhanced features.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + result = await analyzer.analyze_all() + + # Verify new components are present + assert "element_reporting" in result + assert "error_context" in result + + # Verify element reporting structure + reporting = result["element_reporting"] + assert reporting["current_step"]["description"] == "Locating settings button" + assert reporting["element_selection"]["chosen_element"]["selector"] == "button.settings-icon" + 
+ # Verify error context structure + error = result["error_context"] + assert error["session_state"]["status"] == "reset_required" + assert len(error["recovery_steps"]) == 2 + +@pytest.mark.asyncio +async def test_enhanced_action_context_state(enhanced_trace_file): + """Test enhanced action context with detailed element state tracking.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + context = await analyzer.analyze_action_context() + + # Verify element state before action + before_state = context["element_state_before"] + assert before_state["visible"] is True + assert "pointer-events" in before_state["computed_styles"] + assert before_state["focus_state"] == "not-focused" + assert "aria-hidden" in before_state["accessibility"] + + # Verify element state after action + after_state = context["element_state_after"] + assert "focus_state" in after_state + assert len(after_state["triggered_events"]) >= 2 + assert after_state["accessibility"]["aria-hidden"] == "false" + +@pytest.mark.asyncio +async def test_enhanced_decision_trail(enhanced_trace_file): + """Test enhanced decision trail with confidence and attention weights.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + trail = await analyzer.analyze_decision_trail() + + # Verify confidence threshold + assert trail["confidence_threshold"] > 0.7 + + # Verify attention weights + weights = trail["attention_weights"] + assert abs(sum(weights.values()) - 1.0) < 0.01 # Should sum to approximately 1 + assert weights["element_text"] > weights["position"] # Text should have higher weight + + # Verify alternative paths + alternatives = trail["alternative_paths"] + assert len(alternatives) > 0 + assert all("confidence" in path for path in alternatives) + assert all("rejected_reason" in path for path in alternatives) + +@pytest.mark.asyncio +async def test_comprehensive_analysis_with_state_tracking(enhanced_trace_file): + """Test comprehensive analysis including state tracking enhancements.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + result = await analyzer.analyze_all() + + # Verify enhanced action context + context = result["action_context"] + assert "element_state_before" in context + assert "element_state_after" in context + assert "computed_styles" in context["element_state_before"] + + # Verify enhanced decision trail + trail = result["decision_trail"] + assert "confidence_threshold" in trail + assert "attention_weights" in trail + assert "alternative_paths" in trail + +@pytest.mark.asyncio +async def test_enhanced_element_identification(enhanced_trace_file): + """Test enhanced element identification with relative positioning and hierarchy.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + identification = await analyzer.analyze_element_identification() + + # Verify relative positioning + position = identification["relative_position"] + assert "from_top_nav" in position + assert "from_viewport" in position + assert len(position["nearest_landmarks"]) >= 2 + + # Verify element hierarchy + hierarchy = identification["hierarchy"] + assert hierarchy["parent"] == "nav.top-bar" + assert len(hierarchy["siblings"]) >= 2 + assert len(hierarchy["children"]) >= 1 + + # Verify relationships + assert all(isinstance(sibling, str) for sibling in hierarchy["siblings"]) + assert all(isinstance(child, str) for child in hierarchy["children"]) + +@pytest.mark.asyncio +async def test_enhanced_failure_analysis(enhanced_trace_file): + """Test enhanced failure analysis with DOM mutations and network state.""" + analyzer 
= EnhancedTraceAnalyzer(enhanced_trace_file) + failure = await analyzer.analyze_failures() + + # Verify DOM mutations + mutations = failure["dom_mutations"] + assert len(mutations) > 0 + mutation = mutations[0] + assert "timestamp" in mutation + assert "type" in mutation + assert "element" in mutation + assert "old_value" in mutation + assert "new_value" in mutation + + # Verify network state + network = failure["network_state"] + assert "requests_in_flight" in network + assert "last_completed_request" in network + assert len(network["pending_requests"]) > 0 + + # Verify request details + pending = network["pending_requests"][0] + assert "url" in pending + assert "method" in pending + assert "duration_so_far" in pending + +@pytest.mark.asyncio +async def test_comprehensive_analysis_with_enhanced_identification(enhanced_trace_file): + """Test comprehensive analysis including enhanced identification features.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + result = await analyzer.analyze_all() + + # Verify enhanced element identification + identification = result["element_identification"] + assert "relative_position" in identification + assert "hierarchy" in identification + assert identification["hierarchy"]["parent"] == "nav.top-bar" + + # Verify enhanced failure analysis + failure = result["failure_analysis"] + assert "dom_mutations" in failure + assert "network_state" in failure + assert failure["network_state"]["requests_in_flight"] > 0 + +@pytest.mark.asyncio +async def test_enhanced_session_context(enhanced_trace_file): + """Test enhanced session context with viewport and performance metrics.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + session = await analyzer.analyze_session_context() + + # Verify viewport information + viewport = session["viewport"] + assert viewport["width"] == 1920 + assert viewport["height"] == 1080 + assert viewport["device_pixel_ratio"] == 2 + assert viewport["orientation"] == "landscape" + + # Verify performance metrics + metrics = session["performance_metrics"] + assert "memory_usage" in metrics + assert "dom_node_count" in metrics + assert "frame_rate" in metrics + assert all(timing in metrics["resource_timing"] for timing in ["dns_lookup", "connection", "ttfb"]) + + # Verify browser state + browser = session["browser_state"] + assert browser["cookies_enabled"] is True + assert browser["javascript_enabled"] is True + assert "local_storage_used" in browser + assert "active_service_workers" in browser + +@pytest.mark.asyncio +async def test_enhanced_recovery_info(enhanced_trace_file): + """Test enhanced recovery information with state restoration and fallback sequences.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + recovery = await analyzer.analyze_recovery_info() + + # Verify state restoration + restoration = recovery["state_restoration"] + assert len(restoration["checkpoints"]) >= 2 + + # Verify checkpoint details + checkpoint = restoration["checkpoints"][0] + assert "timestamp" in checkpoint + assert "state" in checkpoint + assert "restorable" in checkpoint + assert "snapshot" in checkpoint + assert all(key in checkpoint["snapshot"] for key in ["url", "scroll_position"]) + + # Verify fallback sequences + sequences = recovery["fallback_sequences"] + assert len(sequences) > 0 + sequence = sequences[0] + assert "condition" in sequence + assert len(sequence["actions"]) >= 3 + assert "success_criteria" in sequence + + # Verify action details + action = sequence["actions"][0] + assert "step" in action + assert 
"max_attempts" in action + assert "delay_between_attempts" in action + +@pytest.mark.asyncio +async def test_comprehensive_analysis_with_enriched_context(enhanced_trace_file): + """Test comprehensive analysis including enriched session context and recovery info.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + result = await analyzer.analyze_all() + + # Verify enriched session context + session = result["session_context"] + assert "viewport" in session + assert "performance_metrics" in session + assert "browser_state" in session + assert session["viewport"]["width"] == 1920 + + # Verify enhanced recovery info + recovery = result["recovery_info"] + assert "state_restoration" in recovery + assert "fallback_sequences" in recovery + assert len(recovery["state_restoration"]["checkpoints"]) >= 2 + assert all("success_criteria" in seq for seq in recovery["fallback_sequences"]) + +@pytest.mark.asyncio +async def test_interaction_timing_analysis(enhanced_trace_file): + """Test detailed interaction timing analysis.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + timing = await analyzer.analyze_timing() + + # Verify action breakdown + breakdown = timing["action_breakdown"] + assert "element_search" in breakdown + assert "interaction_delay" in breakdown + assert "animation_duration" in breakdown + assert "network_wait" in breakdown + + # Verify cumulative timing + cumulative = timing["cumulative_timing"] + assert "total_duration" in cumulative + assert "user_perceived_latency" in cumulative + + # Verify performance markers + markers = timing["performance_markers"] + assert all(marker in markers for marker in ["first_paint", "first_contentful_paint", "time_to_interactive"]) + +@pytest.mark.asyncio +async def test_visual_state_tracking(enhanced_trace_file): + """Test visual state tracking and analysis.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + visual = await analyzer.analyze_visual_state() + + # Verify screenshot diffs + diffs = visual["screenshot_diffs"] + assert "before_click" in diffs + assert "after_click" in diffs + assert diffs["changes_highlighted"] is True + + # Verify element visibility + visibility = visual["element_visibility"] + assert "before" in visibility + assert "after" in visibility + assert "visible_area_percentage" in visibility["before"] + assert "viewport_position" in visibility["before"] + + # Verify layout shifts + shifts = visual["layout_shifts"] + assert len(shifts) > 0 + assert "timestamp" in shifts[0] + assert "elements_moved" in shifts[0] + assert "cumulative_layout_shift" in shifts[0] + +@pytest.mark.asyncio +async def test_enhanced_error_recovery(enhanced_trace_file): + """Test enhanced error recovery capabilities.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + recovery = await analyzer.analyze_error_recovery() + + # Verify retry strategy + strategy = recovery["retry_strategy"] + assert strategy["backoff"] == "exponential" + assert strategy["max_attempts"] == 3 + assert all(condition in strategy["conditions"] for condition in ["network_stable", "animations_complete"]) + + # Verify environment factors + env = recovery["environment_factors"] + assert "network_conditions" in env + assert "system_resources" in env + assert all(metric in env["system_resources"] for metric in ["cpu_utilization", "memory_available"]) + + # Verify recovery checkpoints + checkpoints = recovery["recovery_checkpoints"] + assert len(checkpoints) >= 2 + assert all(key in checkpoints[0] for key in ["timestamp", "state", "snapshot"]) + assert 
"dom_state" in checkpoints[0]["snapshot"] + +@pytest.mark.asyncio +async def test_comprehensive_analysis_with_all_features(enhanced_trace_file): + """Test comprehensive analysis including all enhanced features.""" + analyzer = EnhancedTraceAnalyzer(enhanced_trace_file) + result = await analyzer.analyze_all() + + # Verify new components are present + assert "timing_analysis" in result + assert "visual_state" in result + assert "error_recovery" in result + + # Verify timing analysis + timing = result["timing_analysis"] + assert "action_breakdown" in timing + assert "cumulative_timing" in timing + + # Verify visual state + visual = result["visual_state"] + assert "screenshot_diffs" in visual + assert "element_visibility" in visual + + # Verify error recovery + recovery = result["error_recovery"] + assert "retry_strategy" in recovery + assert "environment_factors" in recovery + assert recovery["retry_strategy"]["backoff"] == "exponential" \ No newline at end of file diff --git a/tests/test_error_handling.py b/tests/test_error_handling.py new file mode 100644 index 00000000..653f0683 --- /dev/null +++ b/tests/test_error_handling.py @@ -0,0 +1,98 @@ +import pytest +from datetime import datetime +from typing import Dict, Any, Optional +import asyncio +from src.utils.error_handling import ErrorHandler, MaxRetriesExceededError + +class TestErrorHandler: + @pytest.fixture + def handler(self): + return ErrorHandler() + + @pytest.mark.asyncio + async def test_max_retries_exceeded(self, handler): + operation = "test_operation" + error = ValueError("Test error") + + # Should handle first three attempts + for _ in range(3): + await handler.handle_error(error, operation) + + # Fourth attempt should raise MaxRetriesExceededError + with pytest.raises(MaxRetriesExceededError) as exc_info: + await handler.handle_error(error, operation) + + assert exc_info.value.operation == operation + assert exc_info.value.original_error == error + + @pytest.mark.asyncio + async def test_error_logging(self, handler): + operation = "test_operation" + error = ValueError("Test error") + + # First attempt + await handler.handle_error(error, operation) + + # Get the last logged error + last_error = handler.get_last_error() + assert last_error["operation"] == operation + assert last_error["attempt"] == 1 + assert "timestamp" in last_error + assert last_error["error"]["name"] == "ValueError" + assert last_error["error"]["message"] == "Test error" + + @pytest.mark.asyncio + async def test_exponential_backoff(self, handler): + operation = "test_operation" + error = ValueError("Test error") + + # Record start time + start = datetime.now() + + # First attempt (should delay 1 second) + await handler.handle_error(error, operation) + + # Second attempt (should delay 2 seconds) + await handler.handle_error(error, operation) + + # Calculate duration + duration = (datetime.now() - start).total_seconds() + + # Should have waited at least 3 seconds (1 + 2) + assert duration >= 3 + + @pytest.mark.asyncio + async def test_error_code_extraction(self, handler): + # Test with connection error + error = ConnectionError("ERR_CONNECTION_REFUSED: Failed to connect") + code = handler.extract_error_code(error) + assert code == "ERR_CONNECTION_REFUSED" + + # Test with DNS error + error = Exception("ERR_NAME_NOT_RESOLVED: Could not resolve hostname") + code = handler.extract_error_code(error) + assert code == "ERR_NAME_NOT_RESOLVED" + + # Test with unknown error + error = ValueError("Some other error") + code = handler.extract_error_code(error) + assert 
code == "UNKNOWN_ERROR" + + @pytest.mark.asyncio + async def test_concurrent_retries(self, handler): + operation = "test_operation" + error = ValueError("Test error") + + # Try to handle the same error concurrently + tasks = [ + handler.handle_error(error, operation), + handler.handle_error(error, operation), + handler.handle_error(error, operation) + ] + + # Should complete without raising an error + await asyncio.gather(*tasks, return_exceptions=True) + + # Fourth attempt should still raise MaxRetriesExceededError + with pytest.raises(MaxRetriesExceededError): + await handler.handle_error(error, operation) \ No newline at end of file diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py index 9e2a1d6d..5b29cb3d 100644 --- a/tests/test_llm_api.py +++ b/tests/test_llm_api.py @@ -6,6 +6,7 @@ # @FileName: test_llm_api.py import os import pdb +import pytest from dotenv import load_dotenv @@ -20,12 +21,16 @@ def test_openai_model(): from langchain_core.messages import HumanMessage from src.utils import utils + api_key = os.getenv("OPENAI_API_KEY") + if not api_key: + pytest.skip("OPENAI_API_KEY not set") + llm = utils.get_llm_model( provider="openai", model_name="gpt-4o", temperature=0.8, base_url=os.getenv("OPENAI_ENDPOINT", ""), - api_key=os.getenv("OPENAI_API_KEY", "") + api_key=api_key ) image_path = "assets/examples/test.png" image_data = utils.encode_image(image_path) @@ -47,11 +52,15 @@ def test_gemini_model(): from langchain_core.messages import HumanMessage from src.utils import utils + api_key = os.getenv("GOOGLE_API_KEY") + if not api_key: + pytest.skip("GOOGLE_API_KEY not set") + llm = utils.get_llm_model( provider="gemini", - model_name="gemini-2.0-flash-exp", + model_name="gemini-1.5-pro", temperature=0.8, - api_key=os.getenv("GOOGLE_API_KEY", "") + api_key=api_key ) image_path = "assets/examples/test.png" @@ -73,12 +82,17 @@ def test_azure_openai_model(): from langchain_core.messages import HumanMessage from src.utils import utils + api_key = os.getenv("AZURE_OPENAI_API_KEY") + endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + if not api_key or not endpoint: + pytest.skip("AZURE_OPENAI_API_KEY or AZURE_OPENAI_ENDPOINT not set") + llm = utils.get_llm_model( provider="azure_openai", - model_name="gpt-4o", + model_name="gpt-4", temperature=0.8, - base_url=os.getenv("AZURE_OPENAI_ENDPOINT", ""), - api_key=os.getenv("AZURE_OPENAI_API_KEY", "") + base_url=endpoint, + api_key=api_key ) image_path = "assets/examples/test.png" image_data = utils.encode_image(image_path) @@ -99,12 +113,39 @@ def test_deepseek_model(): from langchain_core.messages import HumanMessage from src.utils import utils + api_key = os.getenv("DEEPSEEK_API_KEY") + if not api_key: + pytest.skip("DEEPSEEK_API_KEY not set") + llm = utils.get_llm_model( provider="deepseek", model_name="deepseek-chat", temperature=0.8, base_url=os.getenv("DEEPSEEK_ENDPOINT", ""), - api_key=os.getenv("DEEPSEEK_API_KEY", "") + api_key=api_key + ) + message = HumanMessage( + content=[ + {"type": "text", "text": "who are you?"} + ] + ) + ai_msg = llm.invoke([message]) + print(ai_msg.content) + + +def test_anthropic_model(): + from langchain_core.messages import HumanMessage + from src.utils import utils + + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + pytest.skip("ANTHROPIC_API_KEY not set") + + llm = utils.get_llm_model( + provider="anthropic", + model_name="claude-3-5-sonnet-latest", + temperature=0.8, + api_key=api_key ) message = HumanMessage( content=[ @@ -118,6 +159,16 @@ def test_deepseek_model(): def 
test_ollama_model(): from langchain_ollama import ChatOllama + # Check if Ollama is running by trying to connect to its default port + import socket + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + result = sock.connect_ex(('localhost', 11434)) + if result != 0: + pytest.skip("Ollama server not running on localhost:11434") + finally: + sock.close() + llm = ChatOllama(model="qwen2.5:7b") ai_msg = llm.invoke("Sing a ballad of LangChain.") print(ai_msg.content) @@ -128,4 +179,5 @@ def test_ollama_model(): # test_gemini_model() # test_azure_openai_model() # test_deepseek_model() + # test_anthropic_model() test_ollama_model() diff --git a/tests/test_llm_integration.py b/tests/test_llm_integration.py new file mode 100644 index 00000000..60dc0056 --- /dev/null +++ b/tests/test_llm_integration.py @@ -0,0 +1,182 @@ +import os +import pytest +from dotenv import load_dotenv +from langchain_core.messages import HumanMessage +from src.utils import utils + +# Load environment variables +load_dotenv() + +class TestOpenAIIntegration: + """Test OpenAI model integration and vision capabilities""" + + def setup_method(self): + """Setup test environment""" + # Ensure required environment variables are set + self.api_key = os.getenv("OPENAI_API_KEY") + self.base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1") + if not self.api_key: + pytest.skip("OPENAI_API_KEY not set") + + def test_gpt4_turbo_initialization(self): + """Test GPT-4 Turbo model initialization""" + llm = utils.get_llm_model( + provider="openai", + model_name="gpt-4o", + temperature=0.8, + base_url=self.base_url, + api_key=self.api_key + ) + assert llm is not None + + def test_gpt4_vision_initialization(self): + """Test GPT-4 Vision model initialization""" + llm = utils.get_llm_model( + provider="openai", + model_name="gpt-4o", + temperature=0.8, + base_url=self.base_url, + api_key=self.api_key, + vision=True + ) + assert llm is not None + + @pytest.mark.asyncio + async def test_vision_capability(self): + """Test vision capability with an example image""" + llm = utils.get_llm_model( + provider="openai", + model_name="gpt-4o", + temperature=0.8, + base_url=self.base_url, + api_key=self.api_key, + vision=True + ) + + # Use a test image + image_path = "assets/examples/test.png" + if not os.path.exists(image_path): + pytest.skip(f"Test image not found at {image_path}") + + image_data = utils.encode_image(image_path) + message = HumanMessage( + content=[ + {"type": "text", "text": "describe this image"}, + { + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}, + }, + ] + ) + response = await llm.ainvoke([message]) + assert response is not None + assert isinstance(response.content, str) + assert len(response.content) > 0 + +class TestAzureOpenAIIntegration: + """Test Azure OpenAI integration""" + + def setup_method(self): + """Setup test environment""" + self.api_key = os.getenv("AZURE_OPENAI_API_KEY") + self.endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + if not self.api_key or not self.endpoint: + pytest.skip("Azure OpenAI credentials not set") + + def test_azure_model_initialization(self): + """Test Azure OpenAI model initialization""" + llm = utils.get_llm_model( + provider="azure_openai", + model_name="gpt-4", + temperature=0.8, + base_url=self.endpoint, + api_key=self.api_key + ) + assert llm is not None + + @pytest.mark.asyncio + async def test_azure_basic_completion(self): + """Test basic completion with Azure OpenAI""" + llm = utils.get_llm_model( + provider="azure_openai", 
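+            # "gpt-4" is assumed to be the Azure deployment name that
+            # utils.get_llm_model resolves for the azure_openai provider
+            # (Azure OpenAI routes requests by deployment rather than raw model id).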
+ model_name="gpt-4", + temperature=0.8, + base_url=self.endpoint, + api_key=self.api_key + ) + + message = HumanMessage(content="Say hello!") + response = await llm.ainvoke([message]) + assert response is not None + assert isinstance(response.content, str) + assert len(response.content) > 0 + +class TestAnthropicIntegration: + """Test Anthropic model integration""" + + def setup_method(self): + """Setup test environment""" + self.api_key = os.getenv("ANTHROPIC_API_KEY") + if not self.api_key: + pytest.skip("ANTHROPIC_API_KEY not set") + + def test_claude_initialization(self): + """Test Claude model initialization""" + llm = utils.get_llm_model( + provider="anthropic", + model_name="claude-3-5-sonnet-latest", + temperature=0.8, + api_key=self.api_key + ) + assert llm is not None + + @pytest.mark.asyncio + async def test_basic_completion(self): + """Test basic completion with Claude""" + llm = utils.get_llm_model( + provider="anthropic", + model_name="claude-3-5-sonnet-latest", + temperature=0.8, + api_key=self.api_key + ) + + message = HumanMessage(content="Say hello!") + response = await llm.ainvoke([message]) + assert response is not None + assert isinstance(response.content, str) + assert len(response.content) > 0 + +def test_model_names_consistency(): + """Test that model names are consistent between toolchain and utils""" + # Test OpenAI models + openai_models = utils.model_names["openai"] + expected_openai = ["gpt-4o"] + assert all(model in openai_models for model in expected_openai), "Missing expected OpenAI models" + + # Test Gemini models + gemini_models = utils.model_names["gemini"] + expected_gemini = ["gemini-1.5-pro", "gemini-2.0-flash"] + assert all(model in gemini_models for model in expected_gemini), "Missing expected Gemini models" + + # Test Anthropic models + anthropic_models = utils.model_names["anthropic"] + expected_anthropic = ["claude-3-5-sonnet-latest", "claude-3-5-sonnet-20241022"] + assert all(model in anthropic_models for model in expected_anthropic), "Missing expected Anthropic models" + + # Test DeepSeek models + deepseek_models = utils.model_names["deepseek"] + expected_deepseek = ["deepseek-chat"] + assert all(model in deepseek_models for model in expected_deepseek), "Missing expected DeepSeek models" + + # Test Azure OpenAI models + azure_models = utils.model_names["azure_openai"] + expected_azure = ["gpt-4", "gpt-3.5-turbo"] + assert all(model in azure_models for model in expected_azure), "Missing expected Azure OpenAI models" + + # Test Ollama models + ollama_models = utils.model_names["ollama"] + expected_ollama = ["qwen2.5:7b", "llama2:7b"] + assert all(model in ollama_models for model in expected_ollama), "Missing expected Ollama models" + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) \ No newline at end of file diff --git a/tests/test_logging.py b/tests/test_logging.py new file mode 100644 index 00000000..5e871e0a --- /dev/null +++ b/tests/test_logging.py @@ -0,0 +1,216 @@ +import json +import logging +import datetime +import pytest +from io import StringIO +from typing import Dict, Any +from src.utils.logging import ( + LogFormatter, + BatchedEventLogger, + setup_logging, + PRODUCTION_EXCLUDE_PATTERNS, + LogLevel +) +import sys + +class TestLogFormatter: + @pytest.fixture + def json_formatter(self): + return LogFormatter(use_json=True) + + @pytest.fixture + def compact_formatter(self): + return LogFormatter(use_json=False) + + def test_json_format_basic_log(self, json_formatter): + record = logging.LogRecord( + name="test_logger", + 
level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None + ) + + formatted = json_formatter.format(record) + parsed = json.loads(formatted) + + assert parsed["level"] == "INFO" + assert parsed["logger"] == "test_logger" + assert parsed["message"] == "Test message" + assert "timestamp" in parsed + + def test_json_format_with_extra_fields(self, json_formatter): + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None + ) + record.event_type = "test_event" + record.event_data = {"key": "value"} + + formatted = json_formatter.format(record) + parsed = json.loads(formatted) + + assert parsed["event_type"] == "test_event" + assert parsed["data"] == {"key": "value"} + + def test_json_format_with_error(self, json_formatter): + try: + raise ValueError("Test error") + except ValueError as e: + record = logging.LogRecord( + name="test_logger", + level=logging.ERROR, + pathname="test.py", + lineno=1, + msg="Error occurred", + args=(), + exc_info=sys.exc_info() + ) + + formatted = json_formatter.format(record) + parsed = json.loads(formatted) + + assert parsed["error"]["type"] == "ValueError" + assert parsed["error"]["message"] == "Test error" + assert "stack_trace" in parsed["error"] + + def test_compact_format_basic_log(self, compact_formatter): + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None + ) + + formatted = compact_formatter.format(record) + assert "] I: Test message" in formatted + + def test_compact_format_with_error(self, compact_formatter): + try: + raise ValueError("Test error") + except ValueError as e: + record = logging.LogRecord( + name="test_logger", + level=logging.ERROR, + pathname="test.py", + lineno=1, + msg="Error occurred", + args=(), + exc_info=sys.exc_info() + ) + + formatted = compact_formatter.format(record) + assert "] E: Error occurred" in formatted + assert "ValueError: Test error" in formatted + +class TestBatchedEventLogger: + @pytest.fixture + def string_io(self): + return StringIO() + + @pytest.fixture + def logger(self, string_io): + handler = logging.StreamHandler(string_io) + handler.setFormatter(LogFormatter(use_json=True)) + logger = logging.getLogger("test_batched") + logger.handlers = [handler] + logger.setLevel(logging.INFO) + return logger + + @pytest.fixture + def batched_logger(self, logger): + return BatchedEventLogger(logger) + + def test_batch_single_event(self, batched_logger, string_io): + event_data = {"action": "click", "element": "button"} + batched_logger.add_event("ui_action", event_data) + batched_logger.flush() + + output = string_io.getvalue() + parsed = json.loads(output) + + assert parsed["event_type"] == "batched_ui_action" + assert parsed["data"]["count"] == 1 + assert parsed["data"]["events"][0] == event_data + + def test_batch_multiple_events(self, batched_logger, string_io): + events = [ + {"action": "click", "element": "button1"}, + {"action": "type", "element": "input1"}, + {"action": "click", "element": "button2"} + ] + + for event in events: + batched_logger.add_event("ui_action", event) + + batched_logger.flush() + + output = string_io.getvalue() + parsed = json.loads(output) + + assert parsed["event_type"] == "batched_ui_action" + assert parsed["data"]["count"] == 3 + assert parsed["data"]["events"] == events + +class TestLoggingSetup: + @pytest.fixture + def temp_logger(self): + # 
Store original handlers + root_logger = logging.getLogger() + original_handlers = root_logger.handlers[:] + + yield root_logger + + # Restore original handlers + root_logger.handlers = original_handlers + + def test_setup_basic_logging(self, temp_logger): + setup_logging(level="INFO", use_json=True) + assert len(temp_logger.handlers) == 1 + assert isinstance(temp_logger.handlers[0].formatter, LogFormatter) + assert temp_logger.level == logging.INFO + + def test_setup_with_exclude_patterns(self, temp_logger): + test_patterns = ["debug", "deprecated"] + setup_logging(level="INFO", exclude_patterns=test_patterns) + + # Create a test record that should be filtered + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="This is a debug message", + args=(), + exc_info=None + ) + + # The record should be filtered out + assert not temp_logger.handlers[0].filter(record) + + def test_production_exclude_patterns(self): + # Verify that all production patterns are strings + assert all(isinstance(pattern, str) for pattern in PRODUCTION_EXCLUDE_PATTERNS) + + # Verify that common patterns are included + common_patterns = ["deprecated", "virtual environment"] + assert all(pattern in PRODUCTION_EXCLUDE_PATTERNS for pattern in common_patterns) + +def test_log_levels(): + # Test that all expected log levels are defined + expected_levels = ["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "TRACE"] + assert all(level in LogLevel.__members__ for level in expected_levels) + + # Test that the values match the names + for level in LogLevel: + assert level.value == level.name \ No newline at end of file diff --git a/tests/test_logging_integration.py b/tests/test_logging_integration.py new file mode 100644 index 00000000..3fa7a1f5 --- /dev/null +++ b/tests/test_logging_integration.py @@ -0,0 +1,219 @@ +import json +import logging +import pytest +import asyncio +from pathlib import Path +from io import StringIO +from typing import Dict, Any, List, Optional + +from src.utils.logging import LogFormatter, BatchedEventLogger, setup_logging +from src.agent.custom_agent import CustomAgent +from browser_use.agent.views import ActionResult +from browser_use.browser.views import BrowserStateHistory +from browser_use.browser.browser import Browser, BrowserConfig +from browser_use.browser.context import BrowserContext, BrowserContextConfig +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import BaseMessage + +class MockElementTree: + def clickable_elements_to_string(self, include_attributes=None): + return "Mock clickable elements" + +class MockBrowserContext(BrowserContext): + def __init__(self): + self.config = BrowserContextConfig() + self.selector_map = {} + self.cached_state = BrowserStateHistory( + url="https://example.com", + title="Example Page", + tabs=[], + interacted_element=[None], + screenshot=None + ) + setattr(self.cached_state, 'selector_map', self.selector_map) + setattr(self.cached_state, 'element_tree', MockElementTree()) + + async def get_state(self, use_vision=True): + return self.cached_state + + async def close(self): + pass + + def __del__(self): + # Override to prevent errors about missing session attribute + pass + +class MockBrowser(Browser): + def __init__(self): + self.config = BrowserConfig() + + async def new_context(self, config): + return MockBrowserContext() + + async def close(self): + pass + +class MockLLM(BaseChatModel): + def with_structured_output(self, output_type, 
include_raw=False): + self._output_type = output_type + return self + + async def ainvoke(self, messages: List[BaseMessage], **kwargs): + return { + 'parsed': self._output_type( + action=[], + current_state={ + 'prev_action_evaluation': 'Success', + 'important_contents': 'Test memory', + 'completed_contents': 'Test progress', + 'thought': 'Test thought', + 'summary': 'Test summary' + } + ) + } + + @property + def _llm_type(self) -> str: + return "mock" + + def _generate(self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager = None, **kwargs): + raise NotImplementedError("Use ainvoke instead") + + @property + def _identifying_params(self) -> Dict[str, Any]: + return {"mock_param": True} + +class ErrorLLM(MockLLM): + async def ainvoke(self, messages: List[BaseMessage], **kwargs): + raise ValueError("Test error") + +class ActionLLM(MockLLM): + async def ainvoke(self, messages: List[BaseMessage], **kwargs): + return { + 'parsed': self._output_type( + action=[ + {'type': 'click', 'selector': '#button1'}, + {'type': 'type', 'selector': '#input1', 'text': 'test'}, + ], + current_state={ + 'prev_action_evaluation': 'Success', + 'important_contents': 'Test memory', + 'completed_contents': 'Test progress', + 'thought': 'Test thought', + 'summary': 'Test summary' + } + ) + } + +@pytest.fixture +def logger(): + # Configure root logger + root_logger = logging.getLogger() + root_logger.setLevel(logging.INFO) + + # Configure test logger + logger = logging.getLogger("test_integration") + logger.setLevel(logging.INFO) + return logger + +@pytest.fixture +def string_io(): + string_io = StringIO() + handler = logging.StreamHandler(string_io) + handler.setFormatter(LogFormatter(use_json=True)) + + # Add handler to root logger + root_logger = logging.getLogger() + root_logger.addHandler(handler) + + # Add handler to test logger + logger = logging.getLogger("test_integration") + logger.addHandler(handler) + + yield string_io + + # Clean up + root_logger.removeHandler(handler) + logger.removeHandler(handler) + +@pytest.mark.asyncio +async def test_agent_logging_integration(logger, string_io): + # Setup + agent = CustomAgent( + task="Test task", + llm=MockLLM(), + browser=MockBrowser(), + browser_context=MockBrowserContext(), + use_vision=True + ) + + # Execute a step + await agent.step() + + # Get all log output + log_output = string_io.getvalue() + log_entries = [json.loads(line) for line in log_output.strip().split('\n') if line.strip()] + + # Print log entries for debugging + print("\nLog entries:", log_entries) + + # Verify log entries + assert len(log_entries) > 0 + assert any('Starting step 1' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Model Response: success' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Step error' in str(entry.get('msg', '')) for entry in log_entries) + +@pytest.mark.asyncio +async def test_agent_error_logging(logger, string_io): + # Setup + agent = CustomAgent( + task="Test task", + llm=ErrorLLM(), + browser=MockBrowser(), + browser_context=MockBrowserContext(), + use_vision=True + ) + + # Execute a step + await agent.step() + + # Get all log output + log_output = string_io.getvalue() + log_entries = [json.loads(line) for line in log_output.strip().split('\n') if line.strip()] + + # Print log entries for debugging + print("\nLog entries:", log_entries) + + # Verify log entries + assert len(log_entries) > 0 + assert any('Starting step 1' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Step 
error' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Use ainvoke instead' in str(entry.get('msg', '')) for entry in log_entries) + +@pytest.mark.asyncio +async def test_agent_batched_logging(logger, string_io): + # Setup + agent = CustomAgent( + task="Test task", + llm=ActionLLM(), + browser=MockBrowser(), + browser_context=MockBrowserContext(), + use_vision=True + ) + + # Execute a step + await agent.step() + + # Get all log output + log_output = string_io.getvalue() + log_entries = [json.loads(line) for line in log_output.strip().split('\n') if line.strip()] + + # Print log entries for debugging + print("\nLog entries:", log_entries) + + # Verify log entries + assert len(log_entries) > 0 + assert any('Starting step 1' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Model Response: success' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Batch: 2 action events' in str(entry.get('msg', '')) for entry in log_entries) + assert any('Step error' in str(entry.get('msg', '')) for entry in log_entries) \ No newline at end of file diff --git a/tests/test_structured_logging.py b/tests/test_structured_logging.py new file mode 100644 index 00000000..9134c4f5 --- /dev/null +++ b/tests/test_structured_logging.py @@ -0,0 +1,270 @@ +import pytest +import json +import logging +import os +from datetime import datetime +from src.utils.structured_logging import ( + StructuredLogger, + ProgressEvent, + BrowserEvent, + JSONFormatter, + ColorizedFormatter, + ColorScheme, + setup_structured_logging +) +from colorama import Fore, Style + +@pytest.fixture +def structured_logger(): + logger = StructuredLogger("test_logger") + return logger + +def test_progress_event_creation(): + event = ProgressEvent( + step="test_step", + status="in_progress", + progress=0.5, + message="Testing progress" + ) + assert event.step == "test_step" + assert event.status == "in_progress" + assert event.progress == 0.5 + assert event.message == "Testing progress" + assert event.timestamp is not None + +def test_browser_event_creation(): + details = {"action": "click", "selector": "#button"} + event = BrowserEvent( + event_type="interaction", + details=details + ) + assert event.event_type == "interaction" + assert event.details == details + assert event.timestamp is not None + +def test_progress_logging(structured_logger): + structured_logger.log_progress( + step="test_step", + status="started", + progress=0.0, + message="Starting test" + ) + + history = structured_logger.get_progress_history() + assert len(history) == 1 + assert history[0]["step"] == "test_step" + assert history[0]["status"] == "started" + assert history[0]["progress"] == 0.0 + assert history[0]["message"] == "Starting test" + +def test_browser_event_logging(structured_logger): + details = {"page": "test.html", "action": "navigate"} + structured_logger.log_browser_event( + event_type="navigation", + details=details + ) + + events = structured_logger.get_browser_events() + assert len(events) == 1 + assert events[0]["event_type"] == "navigation" + assert events[0]["details"] == details + +def test_progress_tracking(structured_logger): + # Test multiple progress updates + steps = [ + ("step1", "started", 0.0, "Starting"), + ("step1", "in_progress", 0.5, "Halfway"), + ("step1", "completed", 1.0, "Done") + ] + + for step, status, progress, message in steps: + structured_logger.log_progress(step, status, progress, message) + + assert structured_logger.get_current_progress() == 1.0 + history = 
structured_logger.get_progress_history() + assert len(history) == 3 + + for i, (step, status, progress, message) in enumerate(steps): + assert history[i]["step"] == step + assert history[i]["status"] == status + assert history[i]["progress"] == progress + assert history[i]["message"] == message + +def test_clear_history(structured_logger): + # Add some events + structured_logger.log_progress("test", "started", 0.5, "Test progress") + structured_logger.log_browser_event("test", {"action": "test"}) + + # Clear history + structured_logger.clear_history() + + assert len(structured_logger.get_progress_history()) == 0 + assert len(structured_logger.get_browser_events()) == 0 + assert structured_logger.get_current_progress() == 0.0 + +def test_json_formatter(): + formatter = JSONFormatter() + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None + ) + + # Add custom fields + setattr(record, 'event_type', 'test_event') + setattr(record, 'data', {'test_key': 'test_value'}) + + formatted = formatter.format(record) + parsed = json.loads(formatted) + + assert parsed["level"] == "INFO" + assert parsed["message"] == "Test message" + assert parsed["logger"] == "test_logger" + assert parsed["event_type"] == "test_event" + assert parsed["data"] == {"test_key": "test_value"} + assert "timestamp" in parsed + +def test_colorized_formatter_with_colors(): + formatter = ColorizedFormatter(use_colors=True) + record = logging.LogRecord( + name="test_logger", + level=logging.ERROR, + pathname="test.py", + lineno=1, + msg="Test error message", + args=(), + exc_info=None + ) + + formatted = formatter.format(record) + assert Fore.RED in formatted # Error level should be red + assert Style.RESET_ALL in formatted # Should have reset codes + assert "[" in formatted and "]" in formatted # Should have timestamp brackets + assert "ERROR" in formatted # Should include level name + +def test_colorized_formatter_without_colors(): + formatter = ColorizedFormatter(use_colors=False) + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None + ) + + formatted = formatter.format(record) + assert Fore.CYAN not in formatted # Should not have color codes + assert Style.RESET_ALL not in formatted + assert "[" in formatted and "]" in formatted + assert "INFO" in formatted + +def test_colorized_formatter_special_keywords(): + formatter = ColorizedFormatter(use_colors=True) + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="✓ STEP(1) completed × failed", + args=(), + exc_info=None + ) + + formatted = formatter.format(record) + assert Fore.GREEN in formatted # Success checkmark + assert Fore.BLUE in formatted # STEP keyword + assert Fore.RED in formatted # Error cross + +def test_colorized_formatter_with_structured_data(): + formatter = ColorizedFormatter(use_colors=True) + record = logging.LogRecord( + name="test_logger", + level=logging.INFO, + pathname="test.py", + lineno=1, + msg="Progress Update", + args=(), + exc_info=None + ) + + # Add structured data + setattr(record, 'event_type', 'progress') + setattr(record, 'data', {'step': 'test', 'progress': 0.5}) + + formatted = formatter.format(record) + assert 'progress' in formatted + assert '"step": "test"' in formatted + assert '"progress": 0.5' in formatted + +def test_color_scheme(): + scheme = ColorScheme() + assert 
scheme.ERROR == Fore.RED + assert scheme.WARNING == Fore.YELLOW + assert scheme.INFO == Fore.CYAN + assert scheme.DEBUG == Style.DIM + assert scheme.SUCCESS == Fore.GREEN + assert scheme.RESET == Style.RESET_ALL + +def test_no_color_environment_variable(): + os.environ['NO_COLOR'] = '1' + formatter = ColorizedFormatter(use_colors=True) # Even with colors enabled + record = logging.LogRecord( + name="test_logger", + level=logging.ERROR, + pathname="test.py", + lineno=1, + msg="Test message", + args=(), + exc_info=None + ) + + formatted = formatter.format(record) + assert Fore.RED not in formatted # Should not have color codes + assert Style.RESET_ALL not in formatted + + # Clean up + del os.environ['NO_COLOR'] + +def test_setup_structured_logging_with_colors(): + # Remove existing handlers + root_logger = logging.getLogger() + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + + # Set up logging with colors + setup_structured_logging(level=logging.DEBUG, use_colors=True, json_output=False) + + assert len(root_logger.handlers) == 1 + assert isinstance(root_logger.handlers[0].formatter, ColorizedFormatter) + assert root_logger.handlers[0].formatter.use_colors is True + +def test_setup_structured_logging_json(): + # Remove existing handlers + root_logger = logging.getLogger() + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + + # Set up logging with JSON output + setup_structured_logging(level=logging.DEBUG, json_output=True) + + assert len(root_logger.handlers) == 1 + assert isinstance(root_logger.handlers[0].formatter, JSONFormatter) + +def test_setup_structured_logging(): + # Remove existing handlers + root_logger = logging.getLogger() + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + + # Set up logging with default settings + setup_structured_logging(level=logging.DEBUG) + + assert root_logger.level == logging.DEBUG + assert len(root_logger.handlers) == 1 + assert isinstance(root_logger.handlers[0].formatter, ColorizedFormatter) # Default to ColorizedFormatter \ No newline at end of file diff --git a/tests/test_task_logging.py b/tests/test_task_logging.py new file mode 100644 index 00000000..50bf5aae --- /dev/null +++ b/tests/test_task_logging.py @@ -0,0 +1,641 @@ +import pytest +from datetime import datetime, timedelta +import json +import asyncio +import os +from src.utils.task_logging import ( + TaskLogger, + TaskContext, + StepInfo, + BrowserState, + TaskStatus, + PerformanceMetrics, + ErrorInfo, + ActionType, + RetryConfig, + RetryInfo, + ColorScheme, + LogFormatter, + SeparatorStyle +) + +def test_task_logger_initialization(): + logger = TaskLogger("test_task", "Test task goal") + context = logger.get_context() + + assert context["task"]["id"] == "test_task" + assert context["task"]["goal"] == "Test task goal" + assert context["task"]["status"] == "pending" + assert context["browser"]["url"] == "" + assert context["browser"]["state"] == "loading" + assert context["browser"]["visible_elements"] == 0 + assert context["browser"]["dynamic_content"] == "loading" + +def test_step_update(): + logger = TaskLogger("test_task", "Test task goal") + + # Update to running state + logger.update_step("Starting navigation", TaskStatus.RUNNING) + context = logger.get_context() + + assert context["task"]["status"] == "running" + assert context["task"]["progress"] == "2/unknown steps" # Step number incremented + + # Update to complete state + logger.update_step("Navigation complete", TaskStatus.COMPLETE) + context 
= logger.get_context() + + assert context["task"]["status"] == "complete" + assert context["task"]["progress"] == "3/unknown steps" + +def test_browser_state_update(): + logger = TaskLogger("test_task", "Test task goal") + + # Update browser state + logger.update_browser_state( + url="https://example.com", + page_ready=True, + dynamic_content_loaded=True, + visible_elements=10 + ) + + context = logger.get_context() + assert context["browser"]["url"] == "https://example.com" + assert context["browser"]["state"] == "ready" + assert context["browser"]["dynamic_content"] == "loaded" + assert context["browser"]["visible_elements"] == 10 + +def test_partial_browser_state_update(): + logger = TaskLogger("test_task", "Test task goal") + + # Update only some fields + logger.update_browser_state(url="https://example.com") + context = logger.get_context() + + assert context["browser"]["url"] == "https://example.com" + assert context["browser"]["state"] == "loading" # Unchanged + assert context["browser"]["dynamic_content"] == "loading" # Unchanged + assert context["browser"]["visible_elements"] == 0 # Unchanged + +def test_elapsed_time_calculation(): + logger = TaskLogger("test_task", "Test task goal") + + # Set a specific start time + start_time = datetime.utcnow() - timedelta(seconds=5) + logger.context.started_at = start_time.isoformat() + + context = logger.get_context() + elapsed_time = float(context["task"]["elapsed_time"].rstrip("s")) + + assert 4.5 <= elapsed_time <= 5.5 # Allow for small timing variations + +def test_task_status_validation(): + logger = TaskLogger("test_task", "Test task goal") + + # Test all valid status values + for status in TaskStatus: + logger.update_step(f"Step with status {status}", status) + context = logger.get_context() + assert context["task"]["status"] == status.value + +def test_json_serialization(): + logger = TaskLogger("test_task", "Test task goal") + context = logger.get_context() + + # Verify that the context can be JSON serialized + json_str = json.dumps(context) + parsed = json.loads(json_str) + + assert parsed["task"]["id"] == "test_task" + assert parsed["task"]["goal"] == "Test task goal" + assert "timestamp" in parsed + assert "elapsed_time" in parsed["task"] + +def test_step_info_status_conversion(): + # Test that string status values are converted to TaskStatus enum + step = StepInfo( + number=1, + description="Test step", + started_at=datetime.utcnow().isoformat(), + status="running" # Pass as string + ) + + assert isinstance(step.status, TaskStatus) + assert step.status == TaskStatus.RUNNING + +def test_error_handling(): + logger = TaskLogger("error_task", "Test error handling") + + # Simulate an error + error = ValueError("Test error") + logger.log_error(error, step_number=1, action="test action") + + context = logger.get_context() + assert context["task"]["status"] == "failed" + assert context["error"]["message"] == "Test error" + assert context["error"]["type"] == "ValueError" + assert context["error"]["step"] == 1 + assert context["error"]["action"] == "test action" + +def test_performance_metrics(): + logger = TaskLogger("perf_task", "Test performance tracking") + + # Start tracking performance + logger.start_performance_tracking() + + # Simulate some steps with timing + logger.update_step("Navigation", TaskStatus.RUNNING) + logger.track_step_duration("navigation", 0.5) + + logger.update_step("Interaction", TaskStatus.RUNNING) + logger.track_step_duration("interaction", 0.3) + + # Get performance metrics + metrics = 
logger.get_performance_metrics() + assert metrics["step_breakdown"]["navigation"] == pytest.approx(0.5) + assert metrics["step_breakdown"]["interaction"] == pytest.approx(0.3) + assert metrics["total_duration"] > 0 + +def test_detailed_browser_state(): + logger = TaskLogger("browser_task", "Test browser state") + + # Update with detailed browser state + logger.update_browser_state( + url="https://example.com", + page_ready=True, + dynamic_content_loaded=True, + visible_elements=10, + current_frame="main", + active_element="search_input", + page_title="Example Page" + ) + + context = logger.get_context() + browser_state = context["browser"] + assert browser_state["url"] == "https://example.com" + assert browser_state["state"] == "ready" + assert browser_state["current_frame"] == "main" + assert browser_state["active_element"] == "search_input" + assert browser_state["page_title"] == "Example Page" + +def test_task_progress_tracking(): + logger = TaskLogger("progress_task", "Test progress tracking") + + # Add steps with progress information + logger.update_step("Step 1", TaskStatus.COMPLETE, progress=0.25) + context = logger.get_context() + assert context["task"]["progress"] == "25%" + + logger.update_step("Step 2", TaskStatus.COMPLETE, progress=0.5) + context = logger.get_context() + assert context["task"]["progress"] == "50%" + + logger.update_step("Final Step", TaskStatus.COMPLETE, progress=1.0) + context = logger.get_context() + assert context["task"]["progress"] == "100%" + +def test_log_formatting(): + logger = TaskLogger("format_task", "Test log formatting") + + # Capture log output + logger.update_step("Navigation", TaskStatus.RUNNING) + log_output = logger.format_log_entry() + + # Verify log format matches the specified structure + assert "[" in log_output # Has timestamp + assert "STEP 2/" in log_output # Has step number (2 because update_step increments) + assert "Navigation" in log_output # Has action + assert "→" in log_output # Has status symbol for running + + # Add another step to test duration + logger.update_step("Click button", TaskStatus.COMPLETE) + log_output = logger.format_log_entry() + assert "(" in log_output and "s)" in log_output # Now we should have duration + +def test_semantic_step_descriptions(): + logger = TaskLogger("semantic_task", "Test semantic descriptions") + + # Test navigation step + logger.update_step( + "Navigate to example.com", + TaskStatus.RUNNING, + action_type=ActionType.NAVIGATION + ) + context = logger.get_context() + assert context["task"]["current_action"] == "navigation" + assert "🌐" in logger.format_log_entry() # Navigation emoji + + # Test interaction step + logger.update_step( + "Click search button", + TaskStatus.RUNNING, + action_type=ActionType.INTERACTION + ) + context = logger.get_context() + assert context["task"]["current_action"] == "interaction" + assert "🖱️" in logger.format_log_entry() # Interaction emoji + + # Test extraction step + logger.update_step( + "Extract search results", + TaskStatus.RUNNING, + action_type=ActionType.EXTRACTION + ) + context = logger.get_context() + assert context["task"]["current_action"] == "extraction" + assert "📑" in logger.format_log_entry() # Extraction emoji + +def test_redundant_message_filtering(): + logger = TaskLogger("filter_task", "Test message filtering") + + # Add multiple steps of the same type + logger.update_step( + "Navigate to example.com", + TaskStatus.RUNNING, + action_type=ActionType.NAVIGATION + ) + logger.update_step( + "Page loaded successfully", + TaskStatus.COMPLETE, + 
action_type=ActionType.NAVIGATION, + suppress_similar=True # Should be filtered as it's a completion of the same action + ) + + # Get all log entries + log_entries = logger.get_log_history() + + # Verify that redundant messages are consolidated + navigation_entries = [entry for entry in log_entries if "Navigate" in entry] + assert len(navigation_entries) == 1 # Only the main action should be logged + + # Verify that the current step shows the completion status + current_log = logger.format_log_entry() + assert "✓" in current_log # Success symbol should be in current state + +def test_action_context_tracking(): + logger = TaskLogger("context_task", "Test action context") + + # Start a navigation action + logger.update_step( + "Navigate to example.com", + TaskStatus.RUNNING, + action_type=ActionType.NAVIGATION, + context={ + "url": "https://example.com", + "method": "GET", + "headers": {"User-Agent": "browser-use"} + } + ) + + context = logger.get_context() + assert "action_context" in context["task"] + assert context["task"]["action_context"]["url"] == "https://example.com" + + # Complete the action with results + logger.update_step( + "Navigation complete", + TaskStatus.COMPLETE, + action_type=ActionType.NAVIGATION, + results={ + "status_code": 200, + "page_title": "Example Domain", + "load_time": 0.5 + } + ) + + context = logger.get_context() + assert "action_results" in context["task"] + assert context["task"]["action_results"]["status_code"] == 200 + +def test_retry_configuration(): + config = RetryConfig( + max_retries=3, + base_delay=1.0, + max_delay=10.0, + jitter=0.1 + ) + + # Test that delays follow exponential backoff pattern + delays = [config.get_delay(attempt) for attempt in range(5)] + assert delays[0] == 0 # First attempt has no delay + assert 0.9 <= delays[1] <= 1.1 # First retry ~1.0s with jitter + assert 1.8 <= delays[2] <= 2.2 # Second retry ~2.0s with jitter + assert 3.6 <= delays[3] <= 4.4 # Third retry ~4.0s with jitter + assert delays[4] == -1 # Beyond max retries + + # Test max delay capping + config = RetryConfig( + max_retries=5, + base_delay=1.0, + max_delay=5.0, + jitter=0.0 # Disable jitter for predictable testing + ) + assert config.get_delay(3) == 4.0 # Within max + assert config.get_delay(4) == 5.0 # Capped at max + +@pytest.mark.asyncio +async def test_retry_execution(): + logger = TaskLogger("retry_task", "Test retry logic") + + # Mock function that fails twice then succeeds + attempt_count = 0 + async def mock_operation(): + nonlocal attempt_count + attempt_count += 1 + if attempt_count <= 2: + raise ValueError("Temporary error") + return "success" + + # Configure retry behavior + retry_config = RetryConfig(max_retries=3, base_delay=0.1) + + # Execute with retry + result = await logger.execute_with_retry( + mock_operation, + "test_operation", + retry_config=retry_config + ) + + assert result == "success" + assert attempt_count == 3 # Two failures + one success + + # Verify retry information in logs + context = logger.get_context() + assert "retries" in context["task"] + retry_info = context["task"]["retries"] + assert retry_info["attempts"] == 3 + assert retry_info["success"] is True + assert len(retry_info["history"]) == 2 # Two retry attempts + +@pytest.mark.asyncio +async def test_retry_max_attempts_exceeded(): + logger = TaskLogger("retry_task", "Test retry logic") + + # Mock function that always fails + async def mock_operation(): + raise ValueError("Persistent error") + + # Configure retry behavior + retry_config = RetryConfig(max_retries=2, 
base_delay=0.1) + + # Execute with retry and expect failure + with pytest.raises(ValueError) as exc_info: + await logger.execute_with_retry( + mock_operation, + "test_operation", + retry_config=retry_config + ) + + assert str(exc_info.value) == "Persistent error" + + # Verify retry information in logs + context = logger.get_context() + assert "retries" in context["task"] + retry_info = context["task"]["retries"] + assert retry_info["attempts"] == 3 # Initial + 2 retries + assert retry_info["success"] is False + assert len(retry_info["history"]) == 3 # Initial attempt + two retries + assert all(entry["error"] == "ValueError: Persistent error" for entry in retry_info["history"]) + + # Verify the delays follow the expected pattern + delays = [entry["delay"] for entry in retry_info["history"]] + assert delays[0] > 0 # First retry has positive delay + assert delays[1] > delays[0] # Second retry has longer delay + assert delays[2] == -1 # Final attempt indicates max retries exceeded + +def test_retry_backoff_calculation(): + config = RetryConfig( + max_retries=3, + base_delay=1.0, + max_delay=10.0, + jitter=0.0 # Disable jitter for predictable testing + ) + + # Test exponential backoff sequence + assert config.get_delay(0) == 0 # First attempt + assert config.get_delay(1) == 1.0 # First retry + assert config.get_delay(2) == 2.0 # Second retry + assert config.get_delay(3) == 4.0 # Third retry + assert config.get_delay(4) == -1 # Beyond max retries + + # Test max delay capping + config = RetryConfig( + max_retries=5, + base_delay=1.0, + max_delay=5.0, + jitter=0.0 + ) + assert config.get_delay(3) == 4.0 # Within max + assert config.get_delay(4) == 5.0 # Capped at max + +def test_color_scheme(): + """Test that color scheme is properly defined and accessible.""" + scheme = ColorScheme() + + # Test error colors + assert scheme.error.startswith("\033[31m") # Red + assert scheme.warning.startswith("\033[33m") # Yellow + assert scheme.info.startswith("\033[36m") # Cyan + assert scheme.success.startswith("\033[32m") # Green + assert scheme.reset == "\033[0m" # Reset + +def test_log_formatting_with_colors(): + """Test that log messages are properly formatted with colors.""" + logger = TaskLogger("color_task", "Test color formatting") + + # Test error formatting + logger.update_step("Failed operation", TaskStatus.FAILED) + log_output = logger.format_log_entry() + assert "\033[31m" in log_output # Contains red color code + assert "×" in log_output # Contains error symbol + + # Test success formatting + logger.update_step("Successful operation", TaskStatus.COMPLETE) + log_output = logger.format_log_entry() + assert "\033[32m" in log_output # Contains green color code + assert "✓" in log_output # Contains success symbol + + # Test running state formatting + logger.update_step("Running operation", TaskStatus.RUNNING) + log_output = logger.format_log_entry() + assert "\033[36m" in log_output # Contains cyan color code + assert "→" in log_output # Contains running symbol + +def test_color_disabled(): + """Test that colors can be disabled via environment variable.""" + os.environ["NO_COLOR"] = "1" + logger = TaskLogger("no_color_task", "Test without colors") + + logger.update_step("Test operation", TaskStatus.COMPLETE) + log_output = logger.format_log_entry() + + # Verify no color codes are present + assert "\033[" not in log_output + assert "✓" in log_output # Symbols still present + + # Clean up + del os.environ["NO_COLOR"] + +def test_color_scheme_customization(): + """Test that color scheme can be 
customized.""" + custom_scheme = ColorScheme( + error="\033[35m", # Magenta for errors + warning="\033[34m", # Blue for warnings + info="\033[37m", # White for info + success="\033[32m" # Keep green for success + ) + + logger = TaskLogger("custom_color_task", "Test custom colors", color_scheme=custom_scheme) + + # Test custom error color + logger.update_step("Failed operation", TaskStatus.FAILED) + log_output = logger.format_log_entry() + assert "\033[35m" in log_output # Contains magenta color code + + # Test custom info color + logger.update_step("Info message", TaskStatus.RUNNING) + log_output = logger.format_log_entry() + assert "\033[37m" in log_output # Contains white color code + +def test_log_formatter_with_colors(): + """Test that the log formatter properly applies colors to different components.""" + formatter = LogFormatter() + + # Create a mock log record + class MockRecord: + def __init__(self, levelname, msg): + self.levelname = levelname + self.msg = msg + self.created = datetime.utcnow().timestamp() + + # Test error formatting + error_record = MockRecord("ERROR", "Test error message") + formatted = formatter.format(error_record) + assert "\033[31m" in formatted # Red for error + assert "ERROR" in formatted + + # Test info formatting + info_record = MockRecord("INFO", "Test info message") + formatted = formatter.format(info_record) + assert "\033[36m" in formatted # Cyan for info + assert "INFO" in formatted + + # Test warning formatting + warn_record = MockRecord("WARNING", "Test warning message") + formatted = formatter.format(warn_record) + assert "\033[33m" in formatted # Yellow for warning + assert "WARNING" in formatted + +def test_task_separator_style(): + """Test that separator styles are properly defined and formatted.""" + style = SeparatorStyle() + + # Test default separator styles + assert len(style.task) >= 50 # Task separator should be substantial + assert len(style.phase) >= 30 # Phase separator should be visible but less prominent + assert len(style.error) >= 40 # Error separator should be distinct + + # Test that styles are different + assert style.task != style.phase + assert style.task != style.error + assert style.phase != style.error + +def test_task_start_separator(): + """Test that separators are added at task start.""" + logger = TaskLogger("separator_task", "Test separators") + + # Get initial log output + log_entries = logger.get_log_history() + + # Should have task separator and initial step + assert len(log_entries) == 2 + assert "=" * 50 in log_entries[0] # Task separator + assert "TASK GOAL: Test separators" in log_entries[1] # Initial step message + +def test_phase_separators(): + """Test that separators are added between different phases.""" + logger = TaskLogger("separator_task", "Test separators") + + # Navigation phase + logger.start_phase("Navigation") + logger.update_step("Navigate to example.com", TaskStatus.COMPLETE, action_type=ActionType.NAVIGATION) + + # Interaction phase + logger.start_phase("Interaction") + logger.update_step("Click button", TaskStatus.COMPLETE, action_type=ActionType.INTERACTION) + + # Get log entries + log_entries = logger.get_log_history() + + # Count phase separators + phase_separators = [entry for entry in log_entries if "-" * 30 in entry] + assert len(phase_separators) == 2 # One before each phase + +def test_error_separators(): + """Test that separators are added around error messages.""" + logger = TaskLogger("separator_task", "Test separators") + + # Simulate an error + try: + raise ValueError("Test 
error") + except Exception as e: + logger.log_error(e, step_number=1, action="test_action") + + # Get log entries + log_entries = logger.get_log_history() + + # Should have error separators + error_separators = [entry for entry in log_entries if "*" * 40 in entry] + assert len(error_separators) == 2 # One before and one after error + +def test_custom_separator_style(): + """Test that separator styles can be customized.""" + custom_style = SeparatorStyle( + task="◈" * 30, + phase="•" * 20, + error="!" * 25 + ) + + logger = TaskLogger("custom_separator_task", "Test custom separators", separator_style=custom_style) + + # Start a phase + logger.start_phase("Test Phase") + + # Get log entries + log_entries = logger.get_log_history() + + # Verify custom separators are used + assert "◈" * 30 in log_entries[0] # Task separator + assert "•" * 20 in log_entries[2] # Phase separator + assert "→" in log_entries[2] # Arrow indicator for phase start + +def test_separator_with_colors(): + """Test that separators can be colored.""" + logger = TaskLogger("colored_separator_task", "Test colored separators") + + # Start a phase + logger.start_phase("Test Phase") + + # Get log entries + log_entries = logger.get_log_history() + + # Verify separators have color codes + task_separator = log_entries[0] + phase_separator = log_entries[1] + + assert "\033[" in task_separator # Contains color code + assert "\033[" in phase_separator # Contains color code + +def test_separator_disabled(): + """Test that separators can be disabled.""" + logger = TaskLogger("no_separator_task", "Test without separators", use_separators=False) + + # Start a phase + logger.start_phase("Test Phase") + + # Get log entries + log_entries = logger.get_log_history() + + # Verify no separators are present + separators = [entry for entry in log_entries if any(c * 20 in entry for c in "=-*")] + assert len(separators) == 0 # No separators should be present \ No newline at end of file diff --git a/tests/test_trace_analyzer.py b/tests/test_trace_analyzer.py new file mode 100644 index 00000000..841d180d --- /dev/null +++ b/tests/test_trace_analyzer.py @@ -0,0 +1,162 @@ +import pytest +import asyncio +import json +import zipfile +from pathlib import Path +import tempfile +from src.trace_analyzer import PlaywrightTrace, analyze_trace + +# Sample trace data +SAMPLE_TRACE_DATA = [ + # Action event (before) + { + "type": "before", + "method": "goto", + "params": {"url": "https://example.com"}, + "timestamp": 1000, + "duration": 500 + }, + # Action event (after - success) + { + "type": "after", + "method": "goto", + "params": {"url": "https://example.com"}, + "timestamp": 1500, + "duration": 500 + }, + # Action event (after - error) + { + "type": "after", + "method": "click", + "params": {"selector": "#missing-button"}, + "timestamp": 2000, + "duration": 100, + "error": {"message": "Element not found"} + }, + # Console event + { + "type": "console", + "text": "Test console message" + }, + # Error event + { + "type": "error", + "error": {"message": "Test error message"} + } +] + +# Sample HAR data +SAMPLE_HAR_DATA = { + "log": { + "entries": [ + { + "request": { + "url": "https://example.com", + "method": "GET" + }, + "response": { + "status": 200, + "statusText": "OK" + }, + "time": 150 + }, + { + "request": { + "url": "https://example.com/missing", + "method": "GET" + }, + "response": { + "status": 404, + "statusText": "Not Found" + }, + "time": 100 + } + ] + } +} + +@pytest.fixture +def sample_trace_file(): + """Create a temporary trace file with sample 
data.""" + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip: + with zipfile.ZipFile(temp_zip.name, 'w') as zf: + # Add trace data + trace_data = '\n'.join(json.dumps(event) for event in SAMPLE_TRACE_DATA) + zf.writestr('trace.trace', trace_data) + + # Add HAR data + zf.writestr('trace.har', json.dumps(SAMPLE_HAR_DATA)) + + yield temp_zip.name + Path(temp_zip.name).unlink() + +@pytest.mark.asyncio +async def test_trace_parsing(sample_trace_file): + """Test basic trace file parsing.""" + trace = await PlaywrightTrace.parse(sample_trace_file) + + # Check actions + assert len(trace.actions) == 3 + assert any(a['type'] == 'goto' and a['success'] for a in trace.actions) + assert any(a['type'] == 'click' and not a['success'] for a in trace.actions) + + # Check console logs + assert len(trace.console_logs) == 1 + assert trace.console_logs[0] == "Test console message" + + # Check errors + assert len(trace.errors) == 1 + assert "Test error message" in trace.errors[0] + + # Check network requests + assert len(trace.network_requests) == 2 + assert any(r['status'] == 200 for r in trace.network_requests) + assert any(r['status'] == 404 for r in trace.network_requests) + +@pytest.mark.asyncio +async def test_analyze_trace(sample_trace_file): + """Test the analyze_trace function.""" + result = await analyze_trace(sample_trace_file) + + assert "actions" in result + assert "network_requests" in result + assert "console_logs" in result + assert "errors" in result + assert "summary" in result + + summary = result["summary"] + assert summary["total_actions"] == 3 + assert summary["failed_actions"] == 1 + assert summary["total_requests"] == 2 + assert summary["failed_requests"] == 1 + assert summary["total_errors"] == 1 + +@pytest.mark.asyncio +async def test_invalid_trace_file(): + """Test handling of invalid trace files.""" + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_file: + temp_file.write(b"Invalid zip data") + + with pytest.raises(ValueError, match="Invalid trace file format"): + await PlaywrightTrace.parse(temp_file.name) + + Path(temp_file.name).unlink() + +@pytest.mark.asyncio +async def test_missing_trace_file(): + """Test handling of missing trace files.""" + with pytest.raises(FileNotFoundError): + await PlaywrightTrace.parse("nonexistent_file.zip") + +@pytest.mark.asyncio +async def test_malformed_trace_data(sample_trace_file): + """Test handling of malformed trace data.""" + with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as temp_zip: + with zipfile.ZipFile(temp_zip.name, 'w') as zf: + zf.writestr('trace.trace', 'Invalid JSON data\n{"type": "console", "text": "Valid event"}') + + trace = await PlaywrightTrace.parse(temp_zip.name) + assert len(trace.errors) == 1 # One error for the invalid JSON + assert len(trace.console_logs) == 1 # One valid console event + + Path(temp_zip.name).unlink() \ No newline at end of file diff --git a/webui.py b/webui.py index b7acffe4..ca96dfc7 100644 --- a/webui.py +++ b/webui.py @@ -7,15 +7,29 @@ import pdb import logging - -from dotenv import load_dotenv - -load_dotenv() import os +import sys import glob import asyncio import argparse import os +import warnings + +from dotenv import load_dotenv +from src.utils.logging import setup_logging, PRODUCTION_EXCLUDE_PATTERNS + +# Filter out the specific deprecation warning from langchain-google-genai +warnings.filterwarnings('ignore', message='Convert_system_message_to_human will be deprecated!') + +load_dotenv() + +# Setup logging before importing other 
modules
+setup_logging(
+    level=os.getenv("LOG_LEVEL", "INFO"),
+    use_json=os.getenv("LOG_JSON", "true").lower() == "true",
+    log_file=os.getenv("LOG_FILE"),
+    exclude_patterns=PRODUCTION_EXCLUDE_PATTERNS if os.getenv("ENVIRONMENT") == "production" else None
+)
 
 logger = logging.getLogger(__name__)
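
The retry tests above pin down a concrete backoff contract for `RetryConfig.get_delay`: no delay on the initial attempt, exponential growth capped at `max_delay`, jitter applied as a fraction of the delay, and `-1` once the retry budget is exhausted. The snippet below is a minimal sketch consistent with those assertions, not the implementation shipped in `src/utils/task_logging.py`; the field defaults shown are assumptions.

```python
import random
from dataclasses import dataclass


@dataclass
class RetryConfig:
    """Backoff policy sketch matching the expectations in tests/test_task_logging.py."""

    max_retries: int = 3      # assumed default
    base_delay: float = 1.0   # assumed default
    max_delay: float = 10.0   # assumed default
    jitter: float = 0.1       # fraction of the delay, assumed default

    def get_delay(self, attempt: int) -> float:
        # Attempt 0 is the initial try and is never delayed.
        if attempt == 0:
            return 0.0
        # Past the retry budget, signal "stop retrying" with -1.
        if attempt > self.max_retries:
            return -1
        # Exponential backoff: base, 2*base, 4*base, ... capped at max_delay.
        delay = min(self.base_delay * (2 ** (attempt - 1)), self.max_delay)
        # Apply +/- jitter as a fraction of the delay; jitter=0.0 keeps it deterministic.
        if self.jitter:
            delay *= 1 + random.uniform(-self.jitter, self.jitter)
        return delay
```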
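The formatter tests in `tests/test_structured_logging.py` likewise describe observable behaviour rather than implementation. A minimal sketch consistent with them is shown below, assuming `colorama` for the color codes and the informal `NO_COLOR` environment convention for the opt-out; judging by the tests, the real classes in `src/utils/structured_logging.py` also highlight keywords (`✓`, `×`, `STEP`) and append structured event data, which this sketch omits.

```python
import json
import logging
import os
from datetime import datetime, timezone

from colorama import Fore, Style


class JSONFormatter(logging.Formatter):
    """Emit one JSON object per log record, passing structured extras through."""

    def format(self, record: logging.LogRecord) -> str:
        entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
        }
        # Extras attached to the record (e.g. event_type, data) pass through as-is.
        for attr in ("event_type", "data"):
            if hasattr(record, attr):
                entry[attr] = getattr(record, attr)
        return json.dumps(entry)


class ColorizedFormatter(logging.Formatter):
    """Human-readable formatter; colors per level, disabled by NO_COLOR."""

    LEVEL_COLORS = {
        "DEBUG": Style.DIM,
        "INFO": Fore.CYAN,
        "WARNING": Fore.YELLOW,
        "ERROR": Fore.RED,
    }

    def __init__(self, use_colors: bool = True):
        super().__init__()
        # Honour the NO_COLOR convention even when colors were requested.
        self.use_colors = use_colors and "NO_COLOR" not in os.environ

    def format(self, record: logging.LogRecord) -> str:
        timestamp = datetime.now().strftime("%H:%M:%S")
        line = f"[{timestamp}] {record.levelname} {record.getMessage()}"
        if not self.use_colors:
            return line
        color = self.LEVEL_COLORS.get(record.levelname, "")
        return f"{color}{line}{Style.RESET_ALL}"
```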
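Finally, the three `setup_structured_logging` tests only require that the helper installs a single handler on the root logger and selects the formatter from its arguments. Below is a sketch under those assumptions, importing the formatters from the module under test; the shipped helper may additionally clear pre-existing handlers, and the separate `setup_logging` used in `webui.py` reads `LOG_LEVEL`, `LOG_JSON` and `LOG_FILE` from the environment instead.

```python
import logging

from src.utils.structured_logging import ColorizedFormatter, JSONFormatter


def setup_structured_logging(
    level: int = logging.INFO,
    use_colors: bool = True,
    json_output: bool = False,
) -> None:
    """Attach one handler to the root logger with the requested formatter."""
    root = logging.getLogger()
    root.setLevel(level)

    handler = logging.StreamHandler()
    # In this sketch, json_output takes precedence over colorized output.
    handler.setFormatter(JSONFormatter() if json_output else ColorizedFormatter(use_colors=use_colors))
    root.addHandler(handler)
```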