Captcha Solving

A step-by-step guide to connecting Steel with Browser-use and solving captchas.

This guide walks you through connecting a Steel cloud browser session with the browser-use framework, enabling an AI agent to interact with websites.

Prerequisites

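Ensure you have the following:

  • Python 3.11 or later, with the uv package manager installed

  • A Steel API key (get one at https://app.steel.dev/settings/api-keys)

  • An OpenAI API key (get one at https://platform.openai.com/api-keys)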

Step 1: Set up your environment

First, create a project directory, set up a virtual environment, and install the required packages:

Terminal
# Create a project directory
mkdir steel-browser-use-agent
cd steel-browser-use-agent

# Recommended: create and activate a virtual environment
uv venv
source .venv/bin/activate  # On Windows, use: .venv\Scripts\activate

# Install required packages into the virtual environment.
# `uv venv` does not install pip into the environment by default, so a bare
# `pip install` could resolve to a pip outside the venv; use uv's pip interface.
uv pip install steel-sdk browser-use python-dotenv

Create a .env file with your API keys:

ENV
.env
STEEL_API_KEY=your_steel_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
TASK=Go to Wikipedia and search for machine learning

Step 2: Create a Steel browser session and initialize Tools and Session Cache

Use the Steel SDK to start a new browser session for your agent:

Python
main.py
import os
import sys
from typing import Any, Dict, List

from browser_use import Tools
from dotenv import load_dotenv
from steel import Steel

# Load environment variables
load_dotenv()
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"

# Validate API key
if STEEL_API_KEY == "your-steel-api-key-here":
    print("⚠️ WARNING: Please replace with your actual Steel API key")
    print(" Get your API key at: https://app.steel.dev/settings/api-keys")
    # `return` is only valid inside a function; stop the script instead.
    sys.exit(1)

# Create a Steel browser session and initialize Tools and Session Cache
tools = Tools()

client = Steel(steel_api_key=STEEL_API_KEY)

# Shared state for tools; "session_id" is read by the CAPTCHA tool.
SESSION_CACHE: Dict[str, Any] = {}

session = client.sessions.create()
# Record the session ID so the CAPTCHA tool can query this session's status.
SESSION_CACHE["session_id"] = session.id

print("✅ Steel browser session started!")
print(f"View live session at: {session.session_viewer_url}")

This creates a new browser session in Steel's cloud. The session_viewer_url allows you to watch your agent's actions in real-time.

Step 3: Define the Captcha Solving tools available to the Agent

Python
main.py
1
def _has_active_captcha(states: List[Dict[str, Any]]) -> bool:
2
for state in states:
3
if bool(state.get("isSolvingCaptcha")):
4
return True
5
return False
6
7
8
def _summarize_states(states: List[Dict[str, Any]]) -> Dict[str, Any]:
9
summary: Dict[str, Any] = {
10
"pages": [],
11
"active_pages": 0,
12
"total_tasks": 0,
13
"solving_tasks": 0,
14
"solved_tasks": 0,
15
"failed_tasks": 0,
16
}
17
18
for state in states:
19
tasks = state.get("tasks", []) or []
20
solving = sum(1 for t in tasks if t.get("status") == "solving")
21
solved = sum(1 for t in tasks if t.get("status") == "solved")
22
failed = sum(
23
1
24
for t in tasks
25
if t.get("status") in ("failed_to_detect", "failed_to_solve")
26
)
27
28
summary["pages"].append(
29
{
30
"pageId": state.get("pageId"),
31
"url": state.get("url"),
32
"isSolvingCaptcha": bool(state.get("isSolvingCaptcha")),
33
"taskCounts": {
34
"total": len(tasks),
35
"solving": solving,
36
"solved": solved,
37
"failed": failed,
38
},
39
}
40
)
41
summary["active_pages"] += 1 if bool(state.get("isSolvingCaptcha")) else 0
42
summary["total_tasks"] += len(tasks)
43
summary["solving_tasks"] += solving
44
summary["solved_tasks"] += solved
45
summary["failed_tasks"] += failed
46
47
return summary
48
49
def _report_captcha_status(
    success: bool,
    message: str,
    started: float,
    last_status: Dict[str, Any],
) -> str:
    """Print a structured status record and return the message for the agent."""
    print(
        {
            "success": success,
            "message": message,
            "duration_ms": int((time.monotonic() - started) * 1000),
            "last_status": last_status,
        }
    )
    return message


@tools.action(
    description=(
        "You need to invoke this tool when you encounter a CAPTCHA. It will get a human to solve the CAPTCHA and wait until the CAPTCHA is solved."
    )
)
def wait_for_captcha_solution() -> str:
    """Poll the Steel session's CAPTCHA status until nothing is being solved.

    Reads the session ID from SESSION_CACHE["session_id"] and polls once per
    second for up to 60 seconds.

    Returns:
        A short status message. Every exit path returns a plain string; the
        structured record (success flag, duration, last status summary) is
        printed for debugging.
    """
    session_id = SESSION_CACHE.get("session_id")
    timeout_ms = 60000
    poll_interval_ms = 1000

    start = time.monotonic()
    end_deadline = start + (timeout_ms / 1000.0)
    last_states: List[Dict[str, Any]] = []

    # Without a session ID the status call cannot target a session.
    if not session_id:
        return _report_captcha_status(
            False,
            "No Steel session ID available; cannot check CAPTCHA status",
            start,
            {},
        )

    while True:
        if time.monotonic() > end_deadline:
            return _report_captcha_status(
                False,
                "Timeout waiting for CAPTCHAs to be solved",
                start,
                _summarize_states(last_states) if last_states else {},
            )

        try:
            # Convert the CAPTCHA status response items to plain dicts
            last_states = [
                state.to_dict() for state in client.sessions.captchas.status(session_id)
            ]
        except Exception:
            return _report_captcha_status(
                False,
                "Failed to get CAPTCHA status; please try again",
                start,
                {},
            )

        if not last_states:
            return _report_captcha_status(True, "No active CAPTCHAs", start, {})

        if not _has_active_captcha(last_states):
            return _report_captcha_status(
                True,
                "All CAPTCHAs solved",
                start,
                _summarize_states(last_states),
            )

        time.sleep(poll_interval_ms / 1000.0)

Step 4: Define Your Browser Session

Connect the browser-use BrowserSession class to your Steel session using the CDP URL:

Python
main.py
from browser_use import Agent, BrowserSession

# Connect browser-use to the Steel session over CDP, using the API key and
# the ID of the session created earlier.
cdp_url = f"wss://connect.steel.dev?apiKey={STEEL_API_KEY}&sessionId={session.id}"
browser_session = BrowserSession(cdp_url=cdp_url)

Step 5: Define your AI Agent

Here we bring it all together by defining our agent with the task, LLM, browser session, and tools it should use.

Python
main.py
# After setting up the browser session
from browser_use import Agent
from browser_use.llm import ChatOpenAI

# Create a ChatOpenAI model for agent reasoning
model = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    api_key=os.getenv('OPENAI_API_KEY'),
)

# Define the task for the agent
task = os.getenv("TASK") or "Go to Wikipedia and search for machine learning"

# Create the agent with the task, model, browser session, and tools
agent = Agent(
    task=task,
    llm=model,
    browser_session=browser_session,
    tools=tools,
)

This configures the AI agent with:

  • An OpenAI model for reasoning

  • The browser session instance from Step 4

  • A specific task to perform

Models: This example uses GPT-4o, but you can use any browser-use compatible models like Anthropic, DeepSeek, or Gemini. See the full list of supported models here.

Step 6: Run your Agent

Python
main.py
import asyncio
import time


# Define the main function with the agent execution
async def main():
    """Run the agent, print a summary, and always release the Steel session."""
    try:
        start_time = time.time()

        print(f"🎯 Executing task: {task}")
        print("=" * 60)

        # Run the agent
        result = await agent.run()

        duration = f"{(time.time() - start_time):.1f}"

        print("\n" + "=" * 60)
        print("🎉 TASK EXECUTION COMPLETED")
        print("=" * 60)
        print(f"⏱️ Duration: {duration} seconds")
        print(f"🎯 Task: {task}")
        if result:
            print(f"📋 Result:\n{result}")
        print("=" * 60)

    except Exception as e:
        print(f"❌ Task execution failed: {e}")
    finally:
        # Clean up resources, whether the task succeeded or failed
        if session:
            print("Releasing Steel session...")
            client.sessions.release(session.id)
            print(f"Session completed. View replay at {session.session_viewer_url}")
        print("Done!")


# Run the async main function
if __name__ == '__main__':
    asyncio.run(main())

The agent connects to the Steel browser session and interacts with it to complete the task. When the run finishes, whether it succeeded or failed, the Steel session is released so it does not keep running.

Complete example

Here's the complete script that puts all steps together:

Python
main.py
"""
AI-powered browser automation using browser-use library with Steel browsers.
https://github.com/steel-dev/steel-cookbook/tree/main/examples/steel-browser-use-starter
"""

import asyncio
import os
import time
from typing import Any, Dict, List

from browser_use import Agent, BrowserSession, Tools
from browser_use.llm import ChatOpenAI
from dotenv import load_dotenv
from steel import Steel

load_dotenv()

# Replace with your own API keys
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "your-openai-api-key-here"

# Replace with your own task
TASK = os.getenv("TASK") or "Go to Wikipedia and search for machine learning"

# Registry that the custom agent tools are attached to via @tools.action
tools = Tools()

client = Steel(steel_api_key=STEEL_API_KEY)

# Shared state for tools; "session_id" is read by wait_for_captcha_solution
SESSION_CACHE: Dict[str, Any] = {}
29
def _has_active_captcha(states: List[Dict[str, Any]]) -> bool:
30
for state in states:
31
if bool(state.get("isSolvingCaptcha")):
32
return True
33
return False
34
35
36
def _summarize_states(states: List[Dict[str, Any]]) -> Dict[str, Any]:
37
summary: Dict[str, Any] = {
38
"pages": [],
39
"active_pages": 0,
40
"total_tasks": 0,
41
"solving_tasks": 0,
42
"solved_tasks": 0,
43
"failed_tasks": 0,
44
}
45
46
for state in states:
47
tasks = state.get("tasks", []) or []
48
solving = sum(1 for t in tasks if t.get("status") == "solving")
49
solved = sum(1 for t in tasks if t.get("status") == "solved")
50
failed = sum(
51
1
52
for t in tasks
53
if t.get("status") in ("failed_to_detect", "failed_to_solve")
54
)
55
56
summary["pages"].append(
57
{
58
"pageId": state.get("pageId"),
59
"url": state.get("url"),
60
"isSolvingCaptcha": bool(state.get("isSolvingCaptcha")),
61
"taskCounts": {
62
"total": len(tasks),
63
"solving": solving,
64
"solved": solved,
65
"failed": failed,
66
},
67
}
68
)
69
summary["active_pages"] += 1 if bool(state.get("isSolvingCaptcha")) else 0
70
summary["total_tasks"] += len(tasks)
71
summary["solving_tasks"] += solving
72
summary["solved_tasks"] += solved
73
summary["failed_tasks"] += failed
74
75
return summary
76
77
def _report_captcha_status(
    success: bool,
    message: str,
    started: float,
    last_status: Dict[str, Any],
) -> str:
    """Print a structured status record and return the message for the agent."""
    print(
        {
            "success": success,
            "message": message,
            "duration_ms": int((time.monotonic() - started) * 1000),
            "last_status": last_status,
        }
    )
    return message


@tools.action(
    description=(
        "You need to invoke this tool when you encounter a CAPTCHA. It will get a human to solve the CAPTCHA and wait until the CAPTCHA is solved."
    )
)
def wait_for_captcha_solution() -> str:
    """Poll the Steel session's CAPTCHA status until nothing is being solved.

    Reads the session ID from SESSION_CACHE["session_id"] and polls once per
    second for up to 60 seconds.

    Returns:
        A short status message. Every exit path returns a plain string; the
        structured record (success flag, duration, last status summary) is
        printed for debugging.
    """
    session_id = SESSION_CACHE.get("session_id")
    timeout_ms = 60000
    poll_interval_ms = 1000

    start = time.monotonic()
    end_deadline = start + (timeout_ms / 1000.0)
    last_states: List[Dict[str, Any]] = []

    # Without a session ID the status call cannot target a session.
    if not session_id:
        return _report_captcha_status(
            False,
            "No Steel session ID available; cannot check CAPTCHA status",
            start,
            {},
        )

    while True:
        if time.monotonic() > end_deadline:
            return _report_captcha_status(
                False,
                "Timeout waiting for CAPTCHAs to be solved",
                start,
                _summarize_states(last_states) if last_states else {},
            )

        try:
            # Convert the CAPTCHA status response items to plain dicts
            last_states = [
                state.to_dict() for state in client.sessions.captchas.status(session_id)
            ]
        except Exception:
            return _report_captcha_status(
                False,
                "Failed to get CAPTCHA status; please try again",
                start,
                {},
            )

        if not last_states:
            return _report_captcha_status(True, "No active CAPTCHAs", start, {})

        if not _has_active_captcha(last_states):
            return _report_captcha_status(
                True,
                "All CAPTCHAs solved",
                start,
                _summarize_states(last_states),
            )

        time.sleep(poll_interval_ms / 1000.0)
async def main():
    """Validate the API keys, start a Steel session, run the agent, and clean up.

    Returns early (after printing a warning) if either API key is still the
    placeholder value or if the Steel session cannot be created. Once a
    session exists it is always released, even if building or running the
    agent fails.
    """
    print("🚀 Steel + Browser Use Assistant")
    print("=" * 60)

    if STEEL_API_KEY == "your-steel-api-key-here":
        print("⚠️ WARNING: Please replace 'your-steel-api-key-here' with your actual Steel API key")
        print(" Get your API key at: https://app.steel.dev/settings/api-keys")
        return

    if OPENAI_API_KEY == "your-openai-api-key-here":
        print("⚠️ WARNING: Please replace 'your-openai-api-key-here' with your actual OpenAI API key")
        print(" Get your API key at: https://platform.openai.com/api-keys")
        return

    print("\nStarting Steel browser session...")

    try:
        # NOTE(review): CAPTCHA solving may need to be enabled when the session
        # is created (e.g. a solve_captcha option) — confirm against the Steel
        # Sessions API.
        session = client.sessions.create()
    except Exception as e:
        print(f"❌ Failed to start Steel browser: {e}")
        print("Please check your STEEL_API_KEY and internet connection.")
        return

    try:
        # Record the session ID so wait_for_captcha_solution can query it.
        SESSION_CACHE["session_id"] = session.id

        print("✅ Steel browser session started!")
        print(f"View live session at: {session.session_viewer_url}")

        print(
            f"\033[1;93mSteel Session created!\033[0m\n"
            f"View session at \033[1;37m{session.session_viewer_url}\033[0m\n"
        )

        cdp_url = f"wss://connect.steel.dev?apiKey={STEEL_API_KEY}&sessionId={session.id}"

        # Build the model and agent inside the try so the session is released
        # even if construction fails.
        model = ChatOpenAI(model="gpt-4o", temperature=0.3, api_key=OPENAI_API_KEY)
        agent = Agent(
            task=TASK,
            llm=model,
            browser_session=BrowserSession(cdp_url=cdp_url),
            tools=tools,
        )

        start_time = time.time()

        print(f"🎯 Executing task: {TASK}")
        print("=" * 60)

        result = await agent.run()

        duration = f"{(time.time() - start_time):.1f}"

        print("\n" + "=" * 60)
        print("🎉 TASK EXECUTION COMPLETED")
        print("=" * 60)
        print(f"⏱️ Duration: {duration} seconds")
        print(f"🎯 Task: {TASK}")
        if result:
            print(f"📋 Result:\n{result}")
        print("=" * 60)

    except Exception as e:
        print(f"❌ Task execution failed: {e}")
    finally:
        if session:
            print("Releasing Steel session...")
            try:
                client.sessions.release(session.id)
            except Exception as e:
                # Report a failed release without masking the task outcome.
                print(f"⚠️ Failed to release Steel session: {e}")
            else:
                print(f"Session completed. View replay at {session.session_viewer_url}")
        print("Done!")


if __name__ == "__main__":
    asyncio.run(main())

Save this as main.py and run it with:

Terminal
python main.py

Customizing your agent's task

Try modifying the task to make your agent perform different actions:

Python
main.py
# Multi-step task prompt that tells the agent when to call the CAPTCHA tool
TASK = """
1. Go to https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php
2. If you see a CAPTCHA box, use the wait_for_captcha_solution tool to solve it
3. Once the CAPTCHA is solved, submit the form
4. Return the result
"""

Congratulations! You've successfully connected a Steel browser session with browser-use to solve a CAPTCHA.