Captcha Solving

This guide walks you through connecting a Steel cloud browser session with the browser-use framework and giving the AI agent a custom tool that waits for CAPTCHAs to be solved while it interacts with websites.

Prerequisites

Ensure you have the following:

Python 3.11 or higher
Steel API key (sign up at app.steel.dev)
OpenAI API key (sign up at platform.openai.com)

Step 1: Set up your environment

First, create a project directory, set up a virtual environment, and install the required packages:

Terminal

# Create a project directory
mkdir steel-browser-use-agent
cd steel-browser-use-agent

# Recommended: Create and activate a virtual environment
uv venv
source .venv/bin/activate  # On Windows, use: .venv\Scripts\activate

# Install required packages into the virtual environment
# (environments created by `uv venv` do not ship pip, so install through uv)
uv pip install steel-sdk browser-use python-dotenv

Create a .env file with your API keys:

ENV

.env

STEEL_API_KEY=your_steel_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
TASK=Go to Wikipedia and search for machine learning

Step 2: Create a Steel browser session and initialize Tools and Session Cache

Use the Steel SDK to start a new browser session for your agent:

Python

main.py

import os
from typing import Any, Dict, List  # List is used by the helper annotations in Step 3

from browser_use import Tools
from dotenv import load_dotenv
from steel import Steel

# Load environment variables
load_dotenv()
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"

# Validate API key. This code runs at module level, where `return` is a
# syntax error, so stop the script with SystemExit instead.
if STEEL_API_KEY == "your-steel-api-key-here":
    print("⚠️  WARNING: Please replace with your actual Steel API key")
    print("   Get your API key at: https://app.steel.dev/settings/api-keys")
    raise SystemExit(1)

# Create a Steel browser session and initialize Tools and Session Cache
# `tools` collects the custom actions that are handed to the agent later.
tools = Tools()

client = Steel(steel_api_key=STEEL_API_KEY)

# Shared state for the custom tools; they look up the session under "session_id".
SESSION_CACHE: Dict[str, Any] = {}

# CAPTCHA solving has to be requested when the session is created.
session = client.sessions.create(solve_captcha=True)

# Without this entry the CAPTCHA tool would query the status of session `None`.
SESSION_CACHE["session_id"] = session.id

print("✅ Steel browser session started!")
print(f"View live session at: {session.session_viewer_url}")

This creates a new browser session in Steel's cloud. The session_viewer_url allows you to watch your agent's actions in real-time.

Step 3: Define the Captcha Solving tools available to the Agent

Python

main.py

1def _has_active_captcha(states: List[Dict[str, Any]]) -> bool:
2    for state in states:
3        if bool(state.get("isSolvingCaptcha")):
4            return True
5    return False
6
7
8def _summarize_states(states: List[Dict[str, Any]]) -> Dict[str, Any]:
9    summary: Dict[str, Any] = {
10        "pages": [],
11        "active_pages": 0,
12        "total_tasks": 0,
13        "solving_tasks": 0,
14        "solved_tasks": 0,
15        "failed_tasks": 0,
16    }
17
18    for state in states:
19        tasks = state.get("tasks", []) or []
20        solving = sum(1 for t in tasks if t.get("status") == "solving")
21        solved = sum(1 for t in tasks if t.get("status") == "solved")
22        failed = sum(
23            1
24            for t in tasks
25            if t.get("status") in ("failed_to_detect", "failed_to_solve")
26        )
27
28        summary["pages"].append(
29            {
30                "pageId": state.get("pageId"),
31                "url": state.get("url"),
32                "isSolvingCaptcha": bool(state.get("isSolvingCaptcha")),
33                "taskCounts": {
34                    "total": len(tasks),
35                    "solving": solving,
36                    "solved": solved,
37                    "failed": failed,
38                },
39            }
40        )
41        summary["active_pages"] += 1 if bool(state.get("isSolvingCaptcha")) else 0
42        summary["total_tasks"] += len(tasks)
43        summary["solving_tasks"] += solving
44        summary["solved_tasks"] += solved
45        summary["failed_tasks"] += failed
46
47    return summary
48
49
@tools.action(
    description=(
        "You need to invoke this tool when you encounter a CAPTCHA. It will get a human to solve the CAPTCHA and wait until the CAPTCHA is solved."
    )
)
def wait_for_captcha_solution() -> str:
    """Poll the CAPTCHA status of the cached Steel session until nothing is being solved.

    The session id is read from ``SESSION_CACHE["session_id"]``. The status is
    polled once per second for at most 60 seconds.

    Returns:
        A short status message for the agent. Every exit path returns a string,
        because browser-use actions are expected to return a string or an
        ActionResult; the timeout path previously returned a dict. A structured
        report (success flag, message, duration, last status) is printed for the
        operator on every exit path.
    """
    session_id = SESSION_CACHE.get("session_id")
    timeout_ms = 60000
    poll_interval_ms = 1000

    start = time.monotonic()
    end_deadline = start + (timeout_ms / 1000.0)
    last_states: List[Dict[str, Any]] = []

    def _finish(success: bool, message: str, last_status: Dict[str, Any]) -> str:
        # Print the structured report, then hand the plain message to the agent.
        print(
            {
                "success": success,
                "message": message,
                "duration_ms": int((time.monotonic() - start) * 1000),
                "last_status": last_status,
            }
        )
        return message

    if not session_id:
        # Nothing to poll: the session id was never stored in SESSION_CACHE.
        return _finish(
            False,
            "No Steel session id found in SESSION_CACHE; cannot check CAPTCHA status",
            {},
        )

    while True:
        if time.monotonic() > end_deadline:
            return _finish(
                False,
                "Timeout waiting for CAPTCHAs to be solved",
                _summarize_states(last_states) if last_states else {},
            )

        try:
            # Convert each status item returned by the SDK to a plain dict
            last_states = [
                state.to_dict() for state in client.sessions.captchas.status(session_id)
            ]
        except Exception as exc:
            # Best effort: surface the cause to the operator and let the agent retry.
            print(f"CAPTCHA status request failed: {exc}")
            return _finish(False, "Failed to get CAPTCHA status; please try again", {})

        if not last_states:
            return _finish(True, "No active CAPTCHAs", {})

        if not _has_active_captcha(last_states):
            return _finish(True, "All CAPTCHAs solved", _summarize_states(last_states))

        time.sleep(poll_interval_ms / 1000.0)

Step 4: Define Your Browser Session

Connect the browser-use BrowserSession class to your Steel session using the CDP URL:

Python

main.py

from browser_use import Agent, BrowserSession

# Connect browser-use to the Steel session
# The CDP URL embeds both the Steel API key and the id of the session created
# in Step 2, so treat it as a secret and avoid logging it.
cdp_url = f"wss://connect.steel.dev?apiKey={STEEL_API_KEY}&sessionId={session.id}"
browser_session = BrowserSession(cdp_url=cdp_url)

Step 5: Define your AI Agent

Here we bring it all together by defining our agent with the task, LLM, browser session, and tools it should use.

Python

main.py

# After setting up the browser session
from browser_use import Agent
from browser_use.llm import ChatOpenAI

# Create a ChatOpenAI model for agent reasoning
# (the key comes from the OPENAI_API_KEY environment variable)
model = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    api_key=os.getenv('OPENAI_API_KEY')
)

# Define the task for the agent; the default is used when TASK is unset or empty
task = os.getenv("TASK") or "Go to Wikipedia and search for machine learning"

# Create the agent with the task, model, browser session, and tools
# (`tools` carries the wait_for_captcha_solution action registered in Step 3)
agent = Agent(
    task=task,
    llm=model,
    browser_session=browser_session,
    tools=tools,
)

This configures the AI agent with:

An OpenAI model for reasoning
The browser session instance from Step 4
A specific task to perform
The custom tools from Step 3, including wait_for_captcha_solution

Models: This example uses GPT-4o, but you can use any browser-use compatible models like Anthropic, DeepSeek, or Gemini. See the full list of supported models here.

Step 6: Run your Agent

Python

main.py

import asyncio  # required by asyncio.run() in the entry guard below
import time


# Define the main function with the agent execution
async def main():
    """Run the agent on `task`, print a timing summary, and release the Steel session."""
    try:
        start_time = time.time()

        print(f"🎯 Executing task: {task}")
        print("=" * 60)

        # Run the agent
        result = await agent.run()

        duration = f"{(time.time() - start_time):.1f}"

        print("\n" + "=" * 60)
        print("🎉 TASK EXECUTION COMPLETED")
        print("=" * 60)
        print(f"⏱️  Duration: {duration} seconds")
        print(f"🎯 Task: {task}")
        if result:
            print(f"📋 Result:\n{result}")
        print("=" * 60)

    except Exception as e:
        print(f"❌ Task execution failed: {e}")
    finally:
        # Clean up resources: release the session whether the task succeeded or failed
        if session:
            print("Releasing Steel session...")
            client.sessions.release(session.id)
            print(f"Session completed. View replay at {session.session_viewer_url}")
        print("Done!")


# Run the async main function
if __name__ == '__main__':
    asyncio.run(main())

The agent connects to the Steel browser session created earlier and interacts with it to complete the task. After completion, it's important to release the Steel session, which the finally block does even when the task fails.

Complete example

Here's the complete script that puts all steps together:

Python

main.py

1"""
2AI-powered browser automation using browser-use library with Steel browsers.
3https://github.com/steel-dev/steel-cookbook/tree/main/examples/steel-browser-use-starter
4"""
5
import asyncio
import os
import time
from typing import Any, Dict, List

from browser_use import Agent, BrowserSession, Tools
from browser_use.llm import ChatOpenAI
from dotenv import load_dotenv
from steel import Steel
13
# Load environment variables (for example from the .env file created in Step 1)
load_dotenv()

# Replace with your own API keys
# The placeholder defaults are what main() checks for before starting a session.
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "your-openai-api-key-here"

# Replace with your own task
TASK = os.getenv("TASK") or "Go to Wikipedia and search for machine learning"

# Collects the actions registered with @tools.action; passed to the Agent in main()
tools = Tools()

client = Steel(steel_api_key=STEEL_API_KEY)

# Shared state for the tools; wait_for_captcha_solution reads the "session_id" key
SESSION_CACHE: Dict[str, Any] = {}
28
29def _has_active_captcha(states: List[Dict[str, Any]]) -> bool:
30    for state in states:
31        if bool(state.get("isSolvingCaptcha")):
32            return True
33    return False
34
35
36def _summarize_states(states: List[Dict[str, Any]]) -> Dict[str, Any]:
37    summary: Dict[str, Any] = {
38        "pages": [],
39        "active_pages": 0,
40        "total_tasks": 0,
41        "solving_tasks": 0,
42        "solved_tasks": 0,
43        "failed_tasks": 0,
44    }
45
46    for state in states:
47        tasks = state.get("tasks", []) or []
48        solving = sum(1 for t in tasks if t.get("status") == "solving")
49        solved = sum(1 for t in tasks if t.get("status") == "solved")
50        failed = sum(
51            1
52            for t in tasks
53            if t.get("status") in ("failed_to_detect", "failed_to_solve")
54        )
55
56        summary["pages"].append(
57            {
58                "pageId": state.get("pageId"),
59                "url": state.get("url"),
60                "isSolvingCaptcha": bool(state.get("isSolvingCaptcha")),
61                "taskCounts": {
62                    "total": len(tasks),
63                    "solving": solving,
64                    "solved": solved,
65                    "failed": failed,
66                },
67            }
68        )
69        summary["active_pages"] += 1 if bool(state.get("isSolvingCaptcha")) else 0
70        summary["total_tasks"] += len(tasks)
71        summary["solving_tasks"] += solving
72        summary["solved_tasks"] += solved
73        summary["failed_tasks"] += failed
74
75    return summary
76
77
@tools.action(
    description=(
        "You need to invoke this tool when you encounter a CAPTCHA. It will get a human to solve the CAPTCHA and wait until the CAPTCHA is solved."
    )
)
def wait_for_captcha_solution() -> str:
    """Poll the CAPTCHA status of the cached Steel session until nothing is being solved.

    The session id is read from ``SESSION_CACHE["session_id"]``. The status is
    polled once per second for at most 60 seconds.

    Returns:
        A short status message for the agent. Every exit path returns a string,
        because browser-use actions are expected to return a string or an
        ActionResult; the timeout path previously returned a dict. A structured
        report (success flag, message, duration, last status) is printed for the
        operator on every exit path.
    """
    session_id = SESSION_CACHE.get("session_id")
    timeout_ms = 60000
    poll_interval_ms = 1000

    start = time.monotonic()
    end_deadline = start + (timeout_ms / 1000.0)
    last_states: List[Dict[str, Any]] = []

    def _finish(success: bool, message: str, last_status: Dict[str, Any]) -> str:
        # Print the structured report, then hand the plain message to the agent.
        print(
            {
                "success": success,
                "message": message,
                "duration_ms": int((time.monotonic() - start) * 1000),
                "last_status": last_status,
            }
        )
        return message

    if not session_id:
        # Nothing to poll: the session id was never stored in SESSION_CACHE.
        return _finish(
            False,
            "No Steel session id found in SESSION_CACHE; cannot check CAPTCHA status",
            {},
        )

    while True:
        if time.monotonic() > end_deadline:
            return _finish(
                False,
                "Timeout waiting for CAPTCHAs to be solved",
                _summarize_states(last_states) if last_states else {},
            )

        try:
            # Convert each status item returned by the SDK to a plain dict
            last_states = [
                state.to_dict() for state in client.sessions.captchas.status(session_id)
            ]
        except Exception as exc:
            # Best effort: surface the cause to the operator and let the agent retry.
            print(f"CAPTCHA status request failed: {exc}")
            return _finish(False, "Failed to get CAPTCHA status; please try again", {})

        if not last_states:
            return _finish(True, "No active CAPTCHAs", {})

        if not _has_active_captcha(last_states):
            return _finish(True, "All CAPTCHAs solved", _summarize_states(last_states))

        time.sleep(poll_interval_ms / 1000.0)
145
146
147
async def main():
    """Create a Steel session, run the browser-use agent on TASK, and release the session.

    Returns early, without creating a session, when either API key is still
    the placeholder value. Once a session exists it is always released, even
    if building or running the agent fails.
    """
    print("🚀 Steel + Browser Use Assistant")
    print("=" * 60)

    if STEEL_API_KEY == "your-steel-api-key-here":
        print("⚠️  WARNING: Please replace 'your-steel-api-key-here' with your actual Steel API key")
        print("   Get your API key at: https://app.steel.dev/settings/api-keys")
        return

    if OPENAI_API_KEY == "your-openai-api-key-here":
        print("⚠️  WARNING: Please replace 'your-openai-api-key-here' with your actual OpenAI API key")
        print("   Get your API key at: https://platform.openai.com/api-keys")
        return

    print("\nStarting Steel browser session...")

    try:
        # CAPTCHA solving has to be requested when the session is created.
        session = client.sessions.create(solve_captcha=True)
    except Exception as e:
        print(f"❌ Failed to start Steel browser: {e}")
        print("Please check your STEEL_API_KEY and internet connection.")
        return

    # wait_for_captcha_solution finds the session through this cache entry;
    # without it the tool would query the status of session `None`.
    SESSION_CACHE["session_id"] = session.id

    print("✅ Steel browser session started!")
    print(f"View live session at: {session.session_viewer_url}")

    print(
        f"\033[1;93mSteel Session created!\033[0m\n"
        f"View session at \033[1;37m{session.session_viewer_url}\033[0m\n"
    )

    try:
        # Build the model and agent inside the try block so that a failure here
        # still reaches the finally clause and releases the session.
        cdp_url = f"wss://connect.steel.dev?apiKey={STEEL_API_KEY}&sessionId={session.id}"

        model = ChatOpenAI(model="gpt-4o", temperature=0.3, api_key=OPENAI_API_KEY)
        agent = Agent(task=TASK, llm=model, browser_session=BrowserSession(cdp_url=cdp_url), tools=tools)

        start_time = time.time()

        print(f"🎯 Executing task: {TASK}")
        print("=" * 60)

        result = await agent.run()

        duration = f"{(time.time() - start_time):.1f}"

        print("\n" + "=" * 60)
        print("🎉 TASK EXECUTION COMPLETED")
        print("=" * 60)
        print(f"⏱️  Duration: {duration} seconds")
        print(f"🎯 Task: {TASK}")
        if result:
            print(f"📋 Result:\n{result}")
        print("=" * 60)

    except Exception as e:
        print(f"❌ Task execution failed: {e}")
    finally:
        # The session is going away, so the tool must not keep polling it.
        SESSION_CACHE.pop("session_id", None)
        print("Releasing Steel session...")
        try:
            client.sessions.release(session.id)
            print(f"Session completed. View replay at {session.session_viewer_url}")
        except Exception as e:
            # Report release problems as such, not as a start-up failure.
            print(f"⚠️  Failed to release Steel session: {e}")
        print("Done!")


if __name__ == "__main__":
    asyncio.run(main())

Save this as main.py and run it with:

python main.py

Customizing your agent's task

Try modifying the task to make your agent perform different actions:

Python

main.py

# Example task that exercises the CAPTCHA tool: step 2 refers to the
# wait_for_captcha_solution action by name so the agent knows to call it.
TASK="""
1. Go to https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php
2. If you see a CAPTCHA box, use the wait_for_captcha_solution tool to solve it
3. Once the CAPTCHA is solved, submit the form
4. Return the result
"""

Congratulations! You've successfully connected a Steel browser session with browser-use to solve a CAPTCHA.