Captcha Solving
A step-by-step guide to connecting Steel with Browser-use and solving captchas.
This guide walks you through connecting a Steel cloud browser session with the browser-use framework, enabling an AI agent to interact with websites.
Prerequisites
Ensure you have the following:
- Python 3.11 or higher
- Steel API key (sign up at app.steel.dev)
- OpenAI API key (sign up at platform.openai.com)
Step 1: Set up your environment
First, create a project directory, set up a virtual environment, and install the required packages:
# Create a project directory
mkdir steel-browser-use-agent
cd steel-browser-use-agent

# Recommended: Create and activate a virtual environment
uv venv
source .venv/bin/activate  # On Windows, use: .venv\Scripts\activate

# Install required packages.
# Environments created by `uv venv` do not include pip by default, so install
# through uv to be sure the packages land inside .venv.
uv pip install steel-sdk browser-use python-dotenv
Create a .env file with your API keys:
STEEL_API_KEY=your_steel_api_key_here
OPENAI_API_KEY=your_openai_api_key_here
TASK=Go to Wikipedia and search for machine learning
Step 2: Create a Steel browser session and initialize Tools and Session Cache
Use the Steel SDK to start a new browser session for your agent:
import os
from typing import Any, Dict

from browser_use import Tools
from dotenv import load_dotenv
from steel import Steel

# Load environment variables
load_dotenv()
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"

# Validate API key
if STEEL_API_KEY == "your-steel-api-key-here":
    print("⚠️ WARNING: Please replace with your actual Steel API key")
    print(" Get your API key at: https://app.steel.dev/settings/api-keys")
    # This is module-level code, so `return` is not valid here; exit instead.
    raise SystemExit(1)

# Create a Steel browser session and initialize Tools and Session Cache
tools = Tools()

client = Steel(steel_api_key=STEEL_API_KEY)

# Shared state read by the agent tools (they look up "session_id" here).
SESSION_CACHE: Dict[str, Any] = {}

# Enable Steel's CAPTCHA solving for this session so there is a status to poll.
session = client.sessions.create(solve_captcha=True)

# Record the session id so the CAPTCHA tool can query this session's status.
SESSION_CACHE["session_id"] = session.id

print("✅ Steel browser session started!")
print(f"View live session at: {session.session_viewer_url}")
This creates a new browser session in Steel's cloud. The session_viewer_url allows you to watch your agent's actions in real-time.
Step 3: Define the Captcha Solving tools available to the Agent
import time
from typing import Any, Dict, List


def _has_active_captcha(states: List[Dict[str, Any]]) -> bool:
    """Return True if any page state reports that a CAPTCHA is being solved."""
    return any(bool(state.get("isSolvingCaptcha")) for state in states)


def _summarize_states(states: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate per-page CAPTCHA states into overall and per-page task counts.

    Each state is read through its "pageId", "url", "isSolvingCaptcha" and
    "tasks" keys; each task is classified by its "status" value.
    """
    summary: Dict[str, Any] = {
        "pages": [],
        "active_pages": 0,
        "total_tasks": 0,
        "solving_tasks": 0,
        "solved_tasks": 0,
        "failed_tasks": 0,
    }

    for state in states:
        # "tasks" may be missing or None; treat both as an empty list.
        tasks = state.get("tasks", []) or []
        statuses = [task.get("status") for task in tasks]
        solving = statuses.count("solving")
        solved = statuses.count("solved")
        failed = sum(
            1 for status in statuses if status in ("failed_to_detect", "failed_to_solve")
        )
        is_solving = bool(state.get("isSolvingCaptcha"))

        summary["pages"].append(
            {
                "pageId": state.get("pageId"),
                "url": state.get("url"),
                "isSolvingCaptcha": is_solving,
                "taskCounts": {
                    "total": len(tasks),
                    "solving": solving,
                    "solved": solved,
                    "failed": failed,
                },
            }
        )
        summary["active_pages"] += 1 if is_solving else 0
        summary["total_tasks"] += len(tasks)
        summary["solving_tasks"] += solving
        summary["solved_tasks"] += solved
        summary["failed_tasks"] += failed

    return summary


def _report(success: bool, message: str, started: float, last_status: Dict[str, Any]) -> str:
    """Print a structured status record and return its message for the agent."""
    print(
        {
            "success": success,
            "message": message,
            "duration_ms": int((time.monotonic() - started) * 1000),
            "last_status": last_status,
        }
    )
    return message


@tools.action(
    description=(
        "You need to invoke this tool when you encounter a CAPTCHA. It will get a human to solve the CAPTCHA and wait until the CAPTCHA is solved."
    )
)
def wait_for_captcha_solution() -> str:
    """Poll the Steel session's CAPTCHA status until nothing is being solved.

    Reads the session id from SESSION_CACHE["session_id"] and polls once per
    second for up to 60 seconds.

    Returns:
        A short status message. Every exit path returns a string so the tool
        result has one consistent type (the timeout path previously returned
        a dict). NOTE(review): browser-use appears to accept str/ActionResult
        results from actions - confirm against the installed version.
    """
    session_id = SESSION_CACHE.get("session_id")
    timeout_ms = 60000
    poll_interval_ms = 1000

    start = time.monotonic()
    if not session_id:
        # Without a session id the status call cannot succeed; say so plainly.
        return _report(
            False, "No Steel session id is cached; cannot check CAPTCHA status", start, {}
        )

    end_deadline = start + (timeout_ms / 1000.0)
    last_states: List[Dict[str, Any]] = []

    while True:
        if time.monotonic() > end_deadline:
            return _report(
                False,
                "Timeout waiting for CAPTCHAs to be solved",
                start,
                _summarize_states(last_states) if last_states else {},
            )

        try:
            # Convert the SDK's CAPTCHA status response items to plain dicts.
            last_states = [
                state.to_dict() for state in client.sessions.captchas.status(session_id)
            ]
        except Exception:
            # Best-effort tool: report the failure to the agent instead of raising.
            return _report(False, "Failed to get CAPTCHA status; please try again", start, {})

        if not last_states:
            return _report(True, "No active CAPTCHAs", start, {})

        if not _has_active_captcha(last_states):
            return _report(True, "All CAPTCHAs solved", start, _summarize_states(last_states))

        time.sleep(poll_interval_ms / 1000.0)
Step 4: Define Your Browser Session
Connect the browser-use BrowserSession class to your Steel session using the CDP URL:
from browser_use import Agent, BrowserSession

# Connect browser-use to the Steel session over CDP.
# The URL carries the Steel API key and the id of the session created earlier.
cdp_url = f"wss://connect.steel.dev?apiKey={STEEL_API_KEY}&sessionId={session.id}"
browser_session = BrowserSession(cdp_url=cdp_url)
Step 5: Define your AI Agent
Here we bring it all together by defining our agent with the task, LLM, browser session, and tools to use.
# After setting up the browser session
from browser_use import Agent
from browser_use.llm import ChatOpenAI

# Create a ChatOpenAI model for agent reasoning
model = ChatOpenAI(
    model="gpt-4o",
    temperature=0.3,
    api_key=os.getenv('OPENAI_API_KEY'),
)

# Define the task for the agent (falls back to a default when TASK is unset)
task = os.getenv("TASK") or "Go to Wikipedia and search for machine learning"

# Create the agent with the task, model, browser session, and tools
agent = Agent(
    task=task,
    llm=model,
    browser_session=browser_session,
    tools=tools,
)
This configures the AI agent with:
- An OpenAI model for reasoning
- The browser session instance from Step 4
- The CAPTCHA-solving tools from Step 3
- A specific task to perform
Models: This example uses GPT-4o, but you can use any browser-use compatible models like Anthropic, DeepSeek, or Gemini. See the full list of supported models here.
Step 6: Run your Agent
import asyncio
import time


# Define the main function with the agent execution
async def main():
    """Run the agent, print a summary, and always release the Steel session."""
    try:
        start_time = time.time()

        print(f"🎯 Executing task: {task}")
        print("=" * 60)

        # Run the agent
        result = await agent.run()

        duration = f"{(time.time() - start_time):.1f}"

        print("\n" + "=" * 60)
        print("🎉 TASK EXECUTION COMPLETED")
        print("=" * 60)
        print(f"⏱️ Duration: {duration} seconds")
        print(f"🎯 Task: {task}")
        if result:
            print(f"📋 Result:\n{result}")
        print("=" * 60)

    except Exception as e:
        print(f"❌ Task execution failed: {e}")
    finally:
        # Clean up resources: release the session whether or not the task succeeded
        if session:
            print("Releasing Steel session...")
            client.sessions.release(session.id)
            print(f"Session completed. View replay at {session.session_viewer_url}")
        print("Done!")


# Run the async main function
if __name__ == '__main__':
    asyncio.run(main())
The agent will drive the Steel browser session to complete the task. After completion, it's important to properly close the browser and release the Steel session.
Complete example
Here's the complete script that puts all steps together:
"""
AI-powered browser automation using browser-use library with Steel browsers.
https://github.com/steel-dev/steel-cookbook/tree/main/examples/steel-browser-use-starter
"""

import asyncio
import os
import time
from typing import Any, Dict, List

from browser_use import Agent, BrowserSession, Tools
from browser_use.llm import ChatOpenAI
from dotenv import load_dotenv
from steel import Steel

load_dotenv()

# Replace with your own API keys
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or "your-openai-api-key-here"

# Replace with your own task
TASK = os.getenv("TASK") or "Go to Wikipedia and search for machine learning"

tools = Tools()

client = Steel(steel_api_key=STEEL_API_KEY)

# Shared state read by the agent tools; main() stores the active session id
# under "session_id" once the Steel session exists.
SESSION_CACHE: Dict[str, Any] = {}


def _has_active_captcha(states: List[Dict[str, Any]]) -> bool:
    """Return True if any page state reports that a CAPTCHA is being solved."""
    return any(bool(state.get("isSolvingCaptcha")) for state in states)


def _summarize_states(states: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Aggregate per-page CAPTCHA states into overall and per-page task counts.

    Each state is read through its "pageId", "url", "isSolvingCaptcha" and
    "tasks" keys; each task is classified by its "status" value.
    """
    summary: Dict[str, Any] = {
        "pages": [],
        "active_pages": 0,
        "total_tasks": 0,
        "solving_tasks": 0,
        "solved_tasks": 0,
        "failed_tasks": 0,
    }

    for state in states:
        # "tasks" may be missing or None; treat both as an empty list.
        tasks = state.get("tasks", []) or []
        statuses = [task.get("status") for task in tasks]
        solving = statuses.count("solving")
        solved = statuses.count("solved")
        failed = sum(
            1 for status in statuses if status in ("failed_to_detect", "failed_to_solve")
        )
        is_solving = bool(state.get("isSolvingCaptcha"))

        summary["pages"].append(
            {
                "pageId": state.get("pageId"),
                "url": state.get("url"),
                "isSolvingCaptcha": is_solving,
                "taskCounts": {
                    "total": len(tasks),
                    "solving": solving,
                    "solved": solved,
                    "failed": failed,
                },
            }
        )
        summary["active_pages"] += 1 if is_solving else 0
        summary["total_tasks"] += len(tasks)
        summary["solving_tasks"] += solving
        summary["solved_tasks"] += solved
        summary["failed_tasks"] += failed

    return summary


def _report(success: bool, message: str, started: float, last_status: Dict[str, Any]) -> str:
    """Print a structured status record and return its message for the agent."""
    print(
        {
            "success": success,
            "message": message,
            "duration_ms": int((time.monotonic() - started) * 1000),
            "last_status": last_status,
        }
    )
    return message


@tools.action(
    description=(
        "You need to invoke this tool when you encounter a CAPTCHA. It will get a human to solve the CAPTCHA and wait until the CAPTCHA is solved."
    )
)
def wait_for_captcha_solution() -> str:
    """Poll the Steel session's CAPTCHA status until nothing is being solved.

    Reads the session id from SESSION_CACHE["session_id"] and polls once per
    second for up to 60 seconds.

    Returns:
        A short status message. Every exit path returns a string so the tool
        result has one consistent type (the timeout path previously returned
        a dict). NOTE(review): browser-use appears to accept str/ActionResult
        results from actions - confirm against the installed version.
    """
    session_id = SESSION_CACHE.get("session_id")
    timeout_ms = 60000
    poll_interval_ms = 1000

    start = time.monotonic()
    if not session_id:
        # Without a session id the status call cannot succeed; say so plainly.
        return _report(
            False, "No Steel session id is cached; cannot check CAPTCHA status", start, {}
        )

    end_deadline = start + (timeout_ms / 1000.0)
    last_states: List[Dict[str, Any]] = []

    while True:
        if time.monotonic() > end_deadline:
            return _report(
                False,
                "Timeout waiting for CAPTCHAs to be solved",
                start,
                _summarize_states(last_states) if last_states else {},
            )

        try:
            # Convert the SDK's CAPTCHA status response items to plain dicts.
            last_states = [
                state.to_dict() for state in client.sessions.captchas.status(session_id)
            ]
        except Exception:
            # Best-effort tool: report the failure to the agent instead of raising.
            return _report(False, "Failed to get CAPTCHA status; please try again", start, {})

        if not last_states:
            return _report(True, "No active CAPTCHAs", start, {})

        if not _has_active_captcha(last_states):
            return _report(True, "All CAPTCHAs solved", start, _summarize_states(last_states))

        time.sleep(poll_interval_ms / 1000.0)


async def main():
    """Create a Steel session, run the agent against it, and release the session."""
    print("🚀 Steel + Browser Use Assistant")
    print("=" * 60)

    if STEEL_API_KEY == "your-steel-api-key-here":
        print("⚠️ WARNING: Please replace 'your-steel-api-key-here' with your actual Steel API key")
        print(" Get your API key at: https://app.steel.dev/settings/api-keys")
        return

    if OPENAI_API_KEY == "your-openai-api-key-here":
        print("⚠️ WARNING: Please replace 'your-openai-api-key-here' with your actual OpenAI API key")
        print(" Get your API key at: https://platform.openai.com/api-keys")
        return

    print("\nStarting Steel browser session...")

    try:
        # Enable Steel's CAPTCHA solving so there is a status for the tool to poll.
        session = client.sessions.create(solve_captcha=True)
    except Exception as e:
        print(f"❌ Failed to start Steel browser: {e}")
        print("Please check your STEEL_API_KEY and internet connection.")
        return

    # Let wait_for_captcha_solution know which session to query.
    SESSION_CACHE["session_id"] = session.id

    print("✅ Steel browser session started!")
    print(f"View live session at: {session.session_viewer_url}")

    print(
        f"\033[1;93mSteel Session created!\033[0m\n"
        f"View session at \033[1;37m{session.session_viewer_url}\033[0m\n"
    )

    # Everything after session creation runs under one try/finally so the
    # session is released even if building the agent fails.
    try:
        cdp_url = f"wss://connect.steel.dev?apiKey={STEEL_API_KEY}&sessionId={session.id}"

        model = ChatOpenAI(model="gpt-4o", temperature=0.3, api_key=OPENAI_API_KEY)
        agent = Agent(
            task=TASK,
            llm=model,
            browser_session=BrowserSession(cdp_url=cdp_url),
            tools=tools,
        )

        start_time = time.time()

        print(f"🎯 Executing task: {TASK}")
        print("=" * 60)

        result = await agent.run()

        duration = f"{(time.time() - start_time):.1f}"

        print("\n" + "=" * 60)
        print("🎉 TASK EXECUTION COMPLETED")
        print("=" * 60)
        print(f"⏱️ Duration: {duration} seconds")
        print(f"🎯 Task: {TASK}")
        if result:
            print(f"📋 Result:\n{result}")
        print("=" * 60)

    except Exception as e:
        print(f"❌ Task execution failed: {e}")
    finally:
        print("Releasing Steel session...")
        client.sessions.release(session.id)
        print(f"Session completed. View replay at {session.session_viewer_url}")
        print("Done!")


if __name__ == "__main__":
    asyncio.run(main())
Save this as main.py and run it with:
python main.py
Customizing your agent's task
Try modifying the task to make your agent perform different actions:
# Example task that exercises the CAPTCHA tool: the agent is told explicitly
# to call wait_for_captcha_solution when it sees a CAPTCHA box.
TASK = """
1. Go to https://recaptcha-demo.appspot.com/recaptcha-v2-checkbox.php
2. If you see a CAPTCHA box, use the wait_for_captcha_solution tool to solve it
3. Once the CAPTCHA is solved, submit the form
4. Return the result
"""
Congratulations! You've successfully connected a Steel browser session with browser-use to solve a CAPTCHA.