Quickstart (Python)

Build a browser agent with the OpenAI Agents SDK for Python and Steel. The agent opens a Steel session, navigates and snapshots the page, optionally extracts structured rows, and returns a Pydantic-validated final report.

Scroll to the bottom for the full example.

Requirements

  • Steel API key

  • OpenAI API key

  • Python 3.11+

Step 1: Project Setup

Terminal
mkdir steel-openai-agents-py && \
cd steel-openai-agents-py && \
python -m venv .venv && \
source .venv/bin/activate && \
touch main.py .env

Step 2: Install Dependencies

Terminal
uv pip install openai-agents steel-sdk playwright pydantic python-dotenv
Terminal
playwright install chromium

Step 3: Environment Variables

ENV
.env
STEEL_API_KEY=your-steel-api-key-here
OPENAI_API_KEY=your-openai-api-key-here

Step 4: Define Steel tools

Each tool is an async function decorated with @function_tool. The SDK reads the signature and docstring to build the JSON schema automatically. Pydantic models are used where an argument needs structure.

Python
main.py
1
import asyncio
2
import os
3
from typing import Optional
4
5
from agents import Agent, Runner, function_tool
6
from dotenv import load_dotenv
7
from playwright.async_api import Browser, Page, async_playwright
8
from pydantic import BaseModel, Field
9
from steel import Steel
10
11
load_dotenv()
12
13
STEEL_API_KEY = os.getenv("STEEL_API_KEY") or "your-steel-api-key-here"
14
steel = Steel(steel_api_key=STEEL_API_KEY)
15
16
_session = None
17
_browser: Optional[Browser] = None
18
_page: Optional[Page] = None
19
_playwright = None
20
21
22
@function_tool
async def open_session() -> dict:
    """Open a Steel cloud browser session. Call exactly once, before anything else."""
    global _session, _browser, _page, _playwright
    # Create the cloud session, then attach Playwright to it over CDP.
    _session = steel.sessions.create()
    _playwright = await async_playwright().start()
    cdp_url = f"{_session.websocket_url}&apiKey={STEEL_API_KEY}"
    _browser = await _playwright.chromium.connect_over_cdp(cdp_url)
    # Reuse the default context's first page when one exists.
    default_context = _browser.contexts[0]
    if default_context.pages:
        _page = default_context.pages[0]
    else:
        _page = await default_context.new_page()
    return {"session_id": _session.id, "live_view_url": _session.session_viewer_url}
@function_tool
async def navigate(url: str) -> dict:
    """Navigate the open session to a URL and wait for the page to load."""
    if _page is None:
        raise RuntimeError("open_session first.")
    # domcontentloaded is enough for text/link scraping and avoids hanging
    # on pages that never reach full "load".
    await _page.goto(url, wait_until="domcontentloaded", timeout=45_000)
    title = await _page.title()
    return {"url": _page.url, "title": title}
@function_tool
async def snapshot(max_chars: int = 4_000, max_links: int = 50) -> dict:
    """Return a readable snapshot of the current page: title, URL, visible
    text (capped), and a list of links. Call BEFORE extract so the agent
    never has to guess CSS selectors.

    Fix: filter out anchors with no visible text BEFORE applying the
    max_links cap, so the agent always gets up to max_links usable links
    even on pages whose first anchors are icon-only.
    """
    if _page is None:
        raise RuntimeError("open_session first.")
    # Single evaluate call: the whole snapshot runs in the browser, so it
    # costs one CDP round-trip regardless of page size.
    return await _page.evaluate(
        """({maxChars, maxLinks}) => {
            const text = (document.body.innerText || '').slice(0, maxChars);
            const links = Array.from(document.querySelectorAll('a[href]'))
                .map((a) => ({
                    text: (a.innerText || a.textContent || '').trim().slice(0, 120),
                    href: a.href,
                }))
                .filter((l) => l.text && l.href)
                .slice(0, maxLinks);
            return { url: location.href, title: document.title, text, links };
        }""",
        {"maxChars": max_chars, "maxLinks": max_links},
    )
class FieldSpec(BaseModel):
    """One column to read from each matched row in extract()."""

    # Key under which the extracted value is stored in each result dict.
    name: str
    # CSS selector scoped to the row; "" reads the row element itself.
    selector: str = Field(
        description="CSS selector relative to the row. Empty string reads the row itself."
    )
    # When set, read this attribute instead of the element's innerText.
    attr: Optional[str] = Field(
        default=None,
        description="Optional attribute to read instead of innerText (e.g. 'href').",
    )
@function_tool
async def extract(
    row_selector: str, fields: list[FieldSpec], limit: int = 10
) -> dict:
    """Extract structured rows from the current page using CSS selectors.
    Prefer calling snapshot() first to confirm the page structure.
    """
    if _page is None:
        raise RuntimeError("open_session first.")
    # Serialize the Pydantic specs into plain dicts for page.evaluate.
    field_payload = [
        {"name": spec.name, "selector": spec.selector, "attr": spec.attr}
        for spec in fields
    ]
    # The entire extraction runs inside the browser in a single evaluate
    # call — one CDP round-trip instead of one per row/field.
    items = await _page.evaluate(
        """({rowSelector, fields, limit}) => {
            const rows = Array.from(
                document.querySelectorAll(rowSelector)
            ).slice(0, limit);
            return rows.map((row) => {
                const item = {};
                for (const f of fields) {
                    const el = f.selector ? row.querySelector(f.selector) : row;
                    if (!el) { item[f.name] = ''; continue; }
                    item[f.name] = f.attr
                        ? (el.getAttribute(f.attr) || '').trim()
                        : (el.innerText || el.textContent || '').trim();
                }
                return item;
            });
        }""",
        {"rowSelector": row_selector, "fields": field_payload, "limit": limit},
    )
    return {"count": len(items), "items": items}
Don't do N×M serial CDP calls

page.query_selector_all + row.query_selector + el.inner_text() look fine locally but each await is a separate CDP round-trip to Steel's cloud browser (~200-300ms each). A 10×4 extract becomes 40 round-trips (8-12 seconds). The page.evaluate version above runs the whole extraction in the browser: one round-trip, <500ms.

Step 5: Build the Agent

Define a Pydantic output_type to get a typed final answer. OpenAI supports output_type + tools together, unlike some providers that force JSON-only mode when you ask for structured output.

Python
main.py
class Repo(BaseModel):
    """A single repository row in the final report."""

    # "owner/repo" identifier.
    name: str
    # Full GitHub URL.
    url: str
    # Star count exactly as rendered on the page, when available.
    stars: Optional[str] = None
    # Repository description, when present.
    description: Optional[str] = None
class FinalReport(BaseModel):
    """Typed final answer the agent must return (enforced via output_type)."""

    summary: str = Field(
        description="One-paragraph summary of what these repos have in common."
    )
    # Pydantic enforces between 1 and 5 entries.
    repos: list[Repo] = Field(min_length=1, max_length=5)
agent = Agent(
    # Identifier shown in traces/logs.
    name="SteelResearch",
    instructions=(
        "You operate a Steel cloud browser via tools. "
        "Workflow: (1) open_session, (2) navigate to the target URL, "
        "(3) snapshot to see the page's text and links, "
        "(4) only call extract when you need structured rows beyond snapshot, "
        "(5) return the final FinalReport. "
        "Prefer snapshot's links list over guessing selectors. Do not invent data."
    ),
    # Fast model for interactive iteration; swap to "gpt-5" for harder pages.
    model="gpt-5-mini",
    tools=[open_session, navigate, snapshot, extract],
    # Pydantic schema the SDK validates the agent's final output against.
    output_type=FinalReport,
)

Step 6: Run and clean up

Python
main.py
async def main() -> None:
    """Run the agent once, print the validated report, and always clean up.

    Fix: the original cleanup ran the three teardown steps sequentially in
    one `finally` block, so an exception from `_browser.close()` (or
    `_playwright.stop()`) skipped the remaining steps and leaked the billed
    Steel session. Nested try/finally guarantees each step runs.
    """
    try:
        result = await Runner.run(
            agent,
            input=(
                "Go to https://github.com/trending/python?since=daily and return the "
                "top 3 AI/ML-related repositories. For each, give name (owner/repo), "
                "GitHub URL, star count as shown, and the repo description."
            ),
            max_turns=15,
        )
        final: FinalReport = result.final_output
        print(final.model_dump_json(indent=2))
    finally:
        # Tear down independently: a failure in one step must not prevent
        # releasing the Steel session.
        try:
            if _browser is not None:
                await _browser.close()
        finally:
            try:
                if _playwright is not None:
                    await _playwright.stop()
            finally:
                if _session is not None:
                    steel.sessions.release(_session.id)


if __name__ == "__main__":
    asyncio.run(main())

Run It

Terminal
python main.py

Swap the model

gpt-5-mini is the default here because it's fast enough for interactive iteration. Swap up to gpt-5 when you need higher-quality reasoning on harder pages — expect 15-40s per turn because of its reasoning stage.

agent = Agent(..., model="gpt-5") # slower, better reasoning

Next Steps