Quickstart (TypeScript)

How to use OpenAI Computer Use with Steel

This guide will walk you through how to use OpenAI's computer-use-preview model with Steel's Computer API to create AI agents that can navigate the web.

We'll be implementing a simple CUA loop that functions as described below:

Computer use - OpenAI API

Prerequisites

  • Node.js 20+

  • A Steel API key (sign up here)

  • An OpenAI API key with access to the computer-use-preview model

Step 1: Setup and Helper Functions

First, create a project directory and install the required packages:

Terminal
# Create a project directory
mkdir steel-openai-computer-use
cd steel-openai-computer-use
# Initialize package.json
npm init -y
# Install required packages
npm install steel-sdk dotenv
npm install -D @types/node typescript ts-node

Create a .env file with your API keys:

ENV
.env
1
STEEL_API_KEY=your_steel_api_key_here
2
OPENAI_API_KEY=your_openai_api_key_here
3
TASK=Go to Steel.dev and find the latest news

Create a file with helper functions, constants, and type definitions:

TypeScript
helpers.ts
1
import * as dotenv from "dotenv";
import { Steel } from "steel-sdk";

// Load variables from the local .env file into process.env before the
// configuration constants below are read.
dotenv.config();

// Credentials and task come from the environment. The string fallbacks are
// deliberate placeholders: main.ts compares against them and aborts at
// startup if they were never replaced.
export const STEEL_API_KEY = process.env.STEEL_API_KEY || "your-steel-api-key-here";
export const OPENAI_API_KEY = process.env.OPENAI_API_KEY || "your-openai-api-key-here";
export const TASK = process.env.TASK || "Go to Steel.dev and find the latest news";
9
10
export function formatToday(): string {
11
return new Intl.DateTimeFormat("en-US", {
12
weekday: "long",
13
month: "long",
14
day: "2-digit",
15
year: "numeric",
16
}).format(new Date());
17
}
18
19
// System prompt sent as the first "system" input item of every task run.
// Evaluated once at module load, so ${formatToday()} bakes in the process
// start date. The tag structure (<BROWSER_ENV> etc.) is runtime text the
// model reads — do not reformat it.
export const BROWSER_SYSTEM_PROMPT = `<BROWSER_ENV>
- You control a headful Chromium browser running in a VM with internet access.
- Interact only through the computer tool (mouse/keyboard/scroll/screenshots). Do not call navigation functions.
- Today's date is ${formatToday()}.
</BROWSER_ENV>

<BROWSER_CONTROL>
- Before acting, take a screenshot to observe state.
- When typing into any input:
* Clear with Ctrl/⌘+A, then Delete.
* After submitting (Enter or clicking a button), take another screenshot and move the mouse aside.
- Computer calls are slow; batch related actions together.
- Zoom out or scroll so all relevant content is visible before reading.
- If the first screenshot is black, click near center and screenshot again.
</BROWSER_CONTROL>

<TASK_EXECUTION>
- You receive exactly one natural-language task and no further user feedback.
- Do not ask clarifying questions; make reasonable assumptions and proceed.
- Prefer minimal, high-signal actions that move directly toward the goal.
- Keep the final response concise and focused on fulfilling the task.
</TASK_EXECUTION>`;
41
42
/** Assistant text item in a Responses API output array. */
export interface MessageItem {
  type: "message";
  content: Array<{ text: string }>;
}

/** A custom function/tool invocation requested by the model. */
export interface FunctionCallItem {
  type: "function_call";
  call_id: string;
  // Function name chosen by the model.
  name: string;
  // JSON-encoded argument payload (left unparsed in this tutorial).
  arguments: string;
}

/** A computer-use action requested by the model. */
export interface ComputerCallItem {
  type: "computer_call";
  call_id: string;
  action: {
    // Action kind, e.g. "click", "type", "scroll"; remaining keys vary per action.
    type: string;
    [key: string]: any;
  };
  // Safety checks the model wants acknowledged before it will proceed.
  pending_safety_checks?: Array<{
    id: string;
    message: string;
  }>;
}

/**
 * Item we send back to the model to complete a tool call: either a
 * screenshot (computer_call_output) or a string result (function_call_output).
 */
export interface OutputItem {
  type: "computer_call_output" | "function_call_output";
  call_id: string;
  // Echoes the model's pending_safety_checks to approve them.
  acknowledged_safety_checks?: Array<{
    id: string;
    message: string;
  }>;
  output?:
    | {
        type: string;
        // data: URL containing the base64 screenshot.
        image_url?: string;
      }
    | string;
}

/** Minimal shape of a /v1/responses API response body used by this tutorial. */
export interface ResponseItem {
  id: string;
  output: (MessageItem | FunctionCallItem | ComputerCallItem)[];
}
86
87
/** [x, y] pixel position within the session viewport. */
export type Coordinates = [number, number];

/** Options accepted by most Steel computer actions. */
export interface BaseActionRequest {
  // When true, Steel returns a screenshot with the action response.
  screenshot?: boolean;
  // Modifier keys held down for the duration of the action.
  hold_keys?: string[];
}

/**
 * Request body for Steel's Computer API, discriminated on `action`.
 * Mirrors the subset of actions this agent translates from OpenAI
 * computer-use calls.
 */
export type ComputerActionRequest =
  | (BaseActionRequest & { action: "move_mouse"; coordinates: Coordinates })
  | (BaseActionRequest & {
      action: "click_mouse";
      button: "left" | "right" | "middle" | "back" | "forward";
      coordinates?: Coordinates;
      num_clicks?: number;
      click_type?: "down" | "up" | "click";
    })
  | (BaseActionRequest & { action: "drag_mouse"; path: Coordinates[] })
  | (BaseActionRequest & {
      action: "scroll";
      coordinates?: Coordinates;
      delta_x?: number;
      delta_y?: number;
    })
  | (BaseActionRequest & { action: "press_key"; keys: string[]; duration?: number })
  | (BaseActionRequest & { action: "type_text"; text: string })
  | (BaseActionRequest & { action: "wait"; duration: number })
  | { action: "take_screenshot" }
  | { action: "get_cursor_position" };
115
116
export async function createResponse(params: any): Promise<ResponseItem> {
117
const url = "https://api.openai.com/v1/responses";
118
const headers: Record<string, string> = {
119
Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
120
"Content-Type": "application/json",
121
};
122
123
const openaiOrg = process.env.OPENAI_ORG;
124
if (openaiOrg) {
125
headers["Openai-Organization"] = openaiOrg;
126
}
127
128
const response = await fetch(url, {
129
method: "POST",
130
headers,
131
body: JSON.stringify(params),
132
});
133
134
if (!response.ok) {
135
const errorText = await response.text();
136
throw new Error(`OpenAI API Error: ${response.status} ${errorText}`);
137
}
138
139
return (await response.json()) as ResponseItem;
140
}
141
142
// Re-export the Steel client so agent.ts can import everything from helpers.
export { Steel };

Step 2: Create the Agent Class

TypeScript
agent.ts
1
import {
2
Steel,
3
STEEL_API_KEY,
4
BROWSER_SYSTEM_PROMPT,
5
Coordinates,
6
ComputerActionRequest,
7
MessageItem,
8
FunctionCallItem,
9
ComputerCallItem,
10
OutputItem,
11
createResponse,
12
} from "./helpers";
13
14
/**
 * Drives OpenAI's computer-use-preview model against a live Steel browser
 * session. Implements the CUA loop: request the next step from the model,
 * translate each returned computer_call into a Steel Computer API request,
 * and feed the resulting screenshot back until the model stops acting.
 */
export class Agent {
  private steel: Steel; // Steel API client
  private session: any | null = null; // live session handle; null until initialize()
  private model: string; // OpenAI model name
  private tools: any[]; // tool definitions sent with every Responses call
  private viewportWidth: number; // browser viewport width (px)
  private viewportHeight: number; // browser viewport height (px)
  private systemPrompt: string; // system prompt used for every run
  private printSteps: boolean = true; // log each model step to the console
  private autoAcknowledgeSafety: boolean = true; // approve safety checks instead of aborting

  constructor() {
    this.steel = new Steel({ steelAPIKey: STEEL_API_KEY });
    this.model = "computer-use-preview";
    this.viewportWidth = 1280;
    this.viewportHeight = 768;
    this.systemPrompt = BROWSER_SYSTEM_PROMPT;
    // Advertise the computer tool with our exact viewport so the model's
    // pixel coordinates map 1:1 onto the Steel session.
    this.tools = [
      {
        type: "computer-preview",
        display_width: this.viewportWidth,
        display_height: this.viewportHeight,
        environment: "browser",
      },
    ];
  }

  // Viewport center; used as the fallback when the model omits coordinates.
  private center(): [number, number] {
    return [
      Math.floor(this.viewportWidth / 2),
      Math.floor(this.viewportHeight / 2),
    ];
  }

  // Coerces a number-or-numeric-string to a number, falling back to `def`.
  private toNumber(v: any, def = 0): number {
    if (typeof v === "number") return v;
    if (typeof v === "string") {
      const n = parseFloat(v);
      return Number.isFinite(n) ? n : def;
    }
    return def;
  }

  // Builds a Coordinates pair, defaulting missing axes to the viewport center.
  private toCoords(x?: any, y?: any): Coordinates {
    const xx = this.toNumber(x, this.center()[0]);
    const yy = this.toNumber(y, this.center()[1]);
    return [xx, yy];
  }

  // Accepts "ctrl+a"-style strings or an array; returns a clean key list.
  private splitKeys(k?: string | string[]): string[] {
    if (Array.isArray(k)) return k.filter(Boolean) as string[];
    if (!k) return [];
    return k
      .split("+")
      .map((s) => s.trim())
      .filter(Boolean);
  }

  // Maps an arbitrary button string to a valid Steel button; defaults to "left".
  private mapButton(btn?: string): "left" | "right" | "middle" | "back" | "forward" {
    const b = (btn || "left").toLowerCase();
    if (b === "right" || b === "middle" || b === "back" || b === "forward") return b;
    return "left";
  }

  // Normalizes model-emitted key names (e.g. "CTRL", "esc", "return") to the
  // canonical names listed in the synonyms table below.
  private normalizeKey(key: string): string {
    if (!key) return key;
    const k = String(key).trim();
    const upper = k.toUpperCase();
    const synonyms: Record<string, string> = {
      ENTER: "Enter",
      RETURN: "Enter",
      ESC: "Escape",
      ESCAPE: "Escape",
      TAB: "Tab",
      BACKSPACE: "Backspace",
      DELETE: "Delete",
      SPACE: "Space",
      CTRL: "Control",
      CONTROL: "Control",
      ALT: "Alt",
      SHIFT: "Shift",
      META: "Meta",
      CMD: "Meta",
      UP: "ArrowUp",
      DOWN: "ArrowDown",
      LEFT: "ArrowLeft",
      RIGHT: "ArrowRight",
      HOME: "Home",
      END: "End",
      PAGEUP: "PageUp",
      PAGEDOWN: "PageDown",
    };
    if (upper in synonyms) return synonyms[upper];
    // Function keys: canonicalize casing of F1..Fn.
    if (upper.startsWith("F") && /^\d+$/.test(upper.slice(1))) {
      return "F" + upper.slice(1);
    }
    // Unknown keys pass through untouched.
    return k;
  }

  private normalizeKeys(keys: string[]): string[] {
    return keys.map((k) => this.normalizeKey(k));
  }

  /** Creates the Steel browser session sized to match the advertised viewport. */
  async initialize(): Promise<void> {
    const width = this.viewportWidth;
    const height = this.viewportHeight;
    this.session = await this.steel.sessions.create({
      dimensions: { width, height },
      blockAds: true,
      timeout: 900000, // session timeout in ms (15 minutes)
    });
    console.log("Steel Session created successfully!");
    console.log(`View live session at: ${this.session.sessionViewerUrl}`);
  }

  /** Releases the Steel session if one is active. Safe to call repeatedly. */
  async cleanup(): Promise<void> {
    if (this.session) {
      console.log("Releasing Steel session...");
      await this.steel.sessions.release(this.session.id);
      console.log(
        `Session completed. View replay at ${this.session.sessionViewerUrl}`
      );
      this.session = null;
    }
  }

  // Captures the current browser state as a base64-encoded image string.
  private async takeScreenshot(): Promise<string> {
    const resp: any = await this.steel.sessions.computer(this.session!.id, {
      action: "take_screenshot",
    });
    const img: string | undefined = resp?.base64_image;
    if (!img) throw new Error("No screenshot returned from Steel");
    return img;
  }

  /**
   * Translates one OpenAI computer action into a Steel Computer API request
   * and returns a base64 screenshot of the resulting state. Unknown action
   * types degrade gracefully to a plain screenshot.
   */
  private async executeComputerAction(
    actionType: string,
    actionArgs: any
  ): Promise<string> {
    let body: ComputerActionRequest | null = null;

    switch (actionType) {
      case "move": {
        const coords = this.toCoords(actionArgs.x, actionArgs.y);
        body = {
          action: "move_mouse",
          coordinates: coords,
          screenshot: true,
        };
        break;
      }
      case "click": {
        const coords = this.toCoords(actionArgs.x, actionArgs.y);
        const button = this.mapButton(actionArgs.button);
        const clicks = this.toNumber(actionArgs.num_clicks, 1);
        body = {
          action: "click_mouse",
          button,
          coordinates: coords,
          // Only include num_clicks when it differs from the single-click default.
          ...(clicks > 1 ? { num_clicks: clicks } : {}),
          screenshot: true,
        };
        break;
      }
      case "doubleClick":
      case "double_click": {
        const coords = this.toCoords(actionArgs.x, actionArgs.y);
        body = {
          action: "click_mouse",
          button: "left",
          coordinates: coords,
          num_clicks: 2,
          screenshot: true,
        };
        break;
      }
      case "drag": {
        const path = Array.isArray(actionArgs.path) ? actionArgs.path : [];
        const steelPath: Coordinates[] = path.map((p: any) =>
          this.toCoords(p.x, p.y)
        );
        // A drag needs at least a start and an end; prepend the viewport
        // center when the model supplied fewer than two points.
        if (steelPath.length < 2) {
          const [cx, cy] = this.center();
          steelPath.unshift([cx, cy]);
        }
        body = {
          action: "drag_mouse",
          path: steelPath,
          screenshot: true,
        };
        break;
      }
      case "scroll": {
        // Coordinates are optional for scroll; omit them entirely when the
        // model provided neither axis.
        const coords =
          actionArgs.x != null || actionArgs.y != null
            ? this.toCoords(actionArgs.x, actionArgs.y)
            : undefined;
        const delta_x = this.toNumber(actionArgs.scroll_x, 0);
        const delta_y = this.toNumber(actionArgs.scroll_y, 0);
        body = {
          action: "scroll",
          ...(coords ? { coordinates: coords } : {}),
          ...(delta_x !== 0 ? { delta_x } : {}),
          ...(delta_y !== 0 ? { delta_y } : {}),
          screenshot: true,
        };
        break;
      }
      case "type": {
        const text = typeof actionArgs.text === "string" ? actionArgs.text : "";
        body = {
          action: "type_text",
          text,
          screenshot: true,
        };
        break;
      }
      case "keypress": {
        const keys = Array.isArray(actionArgs.keys)
          ? actionArgs.keys
          : this.splitKeys(actionArgs.keys);
        const normalized = this.normalizeKeys(keys);
        body = {
          action: "press_key",
          keys: normalized,
          screenshot: true,
        };
        break;
      }
      case "wait": {
        // OpenAI sends milliseconds; Steel expects seconds (min 1 ms).
        const ms = this.toNumber(actionArgs.ms, 1000);
        const seconds = Math.max(0.001, ms / 1000);
        body = {
          action: "wait",
          duration: seconds,
          screenshot: true,
        };
        break;
      }
      case "screenshot": {
        return this.takeScreenshot();
      }
      default: {
        // Unrecognized action: observe state rather than failing the loop.
        return this.takeScreenshot();
      }
    }

    const resp: any = await this.steel.sessions.computer(
      this.session!.id,
      body!
    );
    const img: string | undefined = resp?.base64_image;
    if (img) return img;
    // The action ran but returned no inline screenshot; take one explicitly.
    return this.takeScreenshot();
  }

  /**
   * Processes a single model output item. Messages are printed; function and
   * computer calls are executed and converted into the OutputItem(s) that
   * must be sent back to the model on the next request.
   */
  private async handleItem(
    item: MessageItem | FunctionCallItem | ComputerCallItem
  ): Promise<OutputItem[]> {
    if (item.type === "message") {
      if (this.printSteps) {
        console.log(item.content[0].text);
      }
      return [];
    }

    if (item.type === "function_call") {
      if (this.printSteps) {
        console.log(`${item.name}(${item.arguments})`);
      }
      // No custom functions are registered in this tutorial, so every
      // function call is simply acknowledged as successful.
      return [
        {
          type: "function_call_output",
          call_id: item.call_id,
          output: "success",
        },
      ];
    }

    if (item.type === "computer_call") {
      const { action } = item;
      const actionType = action.type;
      // Everything except the discriminator is the action's argument payload.
      const { type, ...actionArgs } = action;

      if (this.printSteps) {
        console.log(`${actionType}(${JSON.stringify(actionArgs)})`);
      }

      const screenshotBase64 = await this.executeComputerAction(
        actionType,
        actionArgs
      );

      // The model may attach safety checks that must be acknowledged before
      // it continues; abort instead when auto-acknowledge is disabled.
      const pendingChecks = item.pending_safety_checks || [];
      for (const check of pendingChecks) {
        if (this.autoAcknowledgeSafety) {
          console.log(`⚠️ Auto-acknowledging safety check: ${check.message}`);
        } else {
          throw new Error(`Safety check failed: ${check.message}`);
        }
      }

      // Return the post-action screenshot as an input_image data URL.
      const callOutput: OutputItem = {
        type: "computer_call_output",
        call_id: item.call_id,
        acknowledged_safety_checks: pendingChecks,
        output: {
          type: "input_image",
          image_url: `data:image/png;base64,${screenshotBase64}`,
        },
      };

      return [callOutput];
    }

    return [];
  }

  /**
   * Runs the CUA loop for one task and returns the model's final text.
   *
   * @param task          Natural-language task for the model to execute.
   * @param printSteps    Log each model step to the console.
   * @param debug         NOTE(review): currently unused — kept for interface
   *                      stability.
   * @param maxIterations Hard cap on model round-trips.
   */
  async executeTask(
    task: string,
    printSteps: boolean = true,
    debug: boolean = false,
    maxIterations: number = 50
  ): Promise<string> {
    this.printSteps = printSteps;

    // Conversation seed: system prompt plus the user's task.
    const inputItems = [
      {
        role: "system",
        content: this.systemPrompt,
      },
      {
        role: "user",
        content: task,
      },
    ];

    let newItems: any[] = []; // accumulated model outputs + our tool outputs
    let iterations = 0;
    let consecutiveNoActions = 0; // turns in a row with no tool call
    let lastAssistantTexts: string[] = []; // recent assistant messages for repetition check

    console.log(`🎯 Executing task: ${task}`);
    console.log("=".repeat(60));

    // Heuristic loop-breaker: flags an assistant message that shares >80% of
    // its words with any of the recent assistant messages.
    const detectRepetition = (text: string): boolean => {
      if (lastAssistantTexts.length < 2) return false;
      const words1 = text.toLowerCase().split(/\s+/);
      return lastAssistantTexts.some((prev) => {
        const words2 = prev.toLowerCase().split(/\s+/);
        const common = words1.filter((w) => words2.includes(w));
        return common.length / Math.max(words1.length, words2.length) > 0.8;
      });
    };

    while (iterations < maxIterations) {
      iterations++;
      let hasActions = false;

      // If the model just produced a bare assistant message, check whether it
      // is repeating itself before requesting another step.
      if (
        newItems.length > 0 &&
        newItems[newItems.length - 1]?.role === "assistant"
      ) {
        const last = newItems[newItems.length - 1];
        const content = last.content?.[0]?.text;
        if (content) {
          if (detectRepetition(content)) {
            console.log("🔄 Repetition detected - stopping execution");
            lastAssistantTexts.push(content);
            break;
          }
          lastAssistantTexts.push(content);
          // Keep only the three most recent messages for comparison.
          if (lastAssistantTexts.length > 3) lastAssistantTexts.shift();
        }
      }

      try {
        // The full history is resent every turn; context truncation is
        // delegated to the API via truncation: "auto".
        const response = await createResponse({
          model: this.model,
          input: [...inputItems, ...newItems],
          tools: this.tools,
          truncation: "auto",
        });

        if (!response.output) {
          throw new Error("No output from model");
        }

        newItems.push(...response.output);

        for (const item of response.output) {
          if (item.type === "computer_call" || item.type === "function_call") {
            hasActions = true;
          }
          const handleResult = await this.handleItem(item);
          newItems.push(...handleResult);
        }

        // Stop once the model goes three turns without requesting any action.
        if (!hasActions) {
          consecutiveNoActions++;
          if (consecutiveNoActions >= 3) {
            console.log(
              "⚠️ No actions for 3 consecutive iterations - stopping"
            );
            break;
          }
        } else {
          consecutiveNoActions = 0;
        }
      } catch (error) {
        console.error(`❌ Error during task execution: ${error}`);
        throw error;
      }
    }

    if (iterations >= maxIterations) {
      console.warn(
        `⚠️ Task execution stopped after ${maxIterations} iterations`
      );
    }

    // The last assistant message is treated as the task result.
    const assistantMessages = newItems.filter(
      (item) => item.role === "assistant"
    );
    const finalMessage = assistantMessages[assistantMessages.length - 1];

    return (
      finalMessage?.content?.[0]?.text ||
      "Task execution completed (no final message)"
    );
  }
}

Step 3: Create the Main Script

TypeScript
main.ts
1
import { Agent } from "./agent";
2
import { STEEL_API_KEY, OPENAI_API_KEY, TASK } from "./helpers";
3
4
async function main(): Promise<void> {
5
console.log("🚀 Steel + OpenAI Computer Use Assistant");
6
console.log("=".repeat(60));
7
8
if (STEEL_API_KEY === "your-steel-api-key-here") {
9
console.warn(
10
"⚠️ WARNING: Please replace 'your-steel-api-key-here' with your actual Steel API key"
11
);
12
console.warn(
13
" Get your API key at: https://app.steel.dev/settings/api-keys"
14
);
15
throw new Error("Set STEEL_API_KEY");
16
}
17
18
if (OPENAI_API_KEY === "your-openai-api-key-here") {
19
console.warn(
20
"⚠️ WARNING: Please replace 'your-openai-api-key-here' with your actual OpenAI API key"
21
);
22
console.warn(" Get your API key at: https://platform.openai.com/");
23
throw new Error("Set OPENAI_API_KEY");
24
}
25
26
console.log("\nStarting Steel session...");
27
const agent = new Agent();
28
29
try {
30
await agent.initialize();
31
console.log("✅ Steel session started!");
32
33
const startTime = Date.now();
34
const result = await agent.executeTask(TASK, true, false, 50);
35
const duration = ((Date.now() - startTime) / 1000).toFixed(1);
36
37
console.log("\n" + "=".repeat(60));
38
console.log("🎉 TASK EXECUTION COMPLETED");
39
console.log("=".repeat(60));
40
console.log(`⏱️ Duration: ${duration} seconds`);
41
console.log(`🎯 Task: ${TASK}`);
42
console.log(`📋 Result:\n${result}`);
43
console.log("=".repeat(60));
44
} catch (error) {
45
console.log(`❌ Failed to run: ${error}`);
46
throw error;
47
} finally {
48
await agent.cleanup();
49
}
50
}
51
52
main()
53
.then(() => {
54
process.exit(0);
55
})
56
.catch((error) => {
57
console.error("Task execution failed:", error);
58
process.exit(1);
59
});

Running Your Agent

Execute your script to start an interactive AI browser session:

Terminal
npx ts-node main.ts

The agent will execute the task defined in the TASK environment variable or the default task. You can modify the task by setting the environment variable:

Terminal
export TASK="Research the top 5 electric vehicles with the longest range"
npx ts-node main.ts

You'll see each action the agent takes displayed in the console, and you can view the live browser session by opening the session URL in your web browser.

Next Steps

  • Explore the Steel API documentation for more advanced features

  • Check out the OpenAI documentation for more information about the computer-use-preview model

  • Add additional features like session recording or multi-session management