// aitest/6-tc-computer.ts

// Run with:
//   bun run 6-tc-computer.ts
// Override the user prompt with:
//   bun run 6-tc-computer.ts --prompt "Go to example.com and summarize the page."
// Note: this script intentionally leaves the Playwright browser open after the
// model reaches a final answer. Because the browser/context are not closed,
// Bun stays alive until you close the browser or stop the process manually.
// The browser is only closed when the model calls the `end_program` tool.

import OpenAI from 'openai'
import readline from 'node:readline/promises'
import vm from 'node:vm'
import { chromium } from 'playwright'
import util from 'node:util'
import shell from 'shelljs'

/**
 * Returns a promise that resolves (with no value) once `ms` milliseconds have
 * elapsed according to `setTimeout`. The promise never rejects.
 *
 * @param ms - Delay in milliseconds passed straight to `setTimeout`.
 */
const sleep = (ms: number): Promise<void> =>
  new Promise(resolve => setTimeout(resolve, ms))

/**
 * Runs an interactive agent loop: sends the conversation to the OpenAI
 * Responses API, executes whatever tool calls come back (local shell
 * commands, JavaScript in a persistent `node:vm` context that exposes a
 * Playwright browser/context/page, or a question to the human on stdin), and
 * appends the tool results to the conversation before the next round.
 *
 * The loop has no step limit. It only stops when the model calls the
 * `end_program` tool, which closes the browser and exits the process.
 *
 * @param prompt - Initial user message that seeds the conversation.
 * @param model - Model name passed to `client.responses.create`.
 * @throws Error if the `OPENAI_API_KEY` environment variable is not set.
 */
async function main (
  prompt: string = 'Show a welcome message on screen. After the task is complete, ask the user to confirm before ending the program, or ask user for more tasks to perform on the website.',
  model: string = 'gpt-5.2'
) {
  // The API key is a secret and must never live in source control; take it
  // from the environment and fail fast (before launching a browser) if absent.
  const apiKey = process.env.OPENAI_API_KEY
  if (!apiKey) {
    throw new Error(
      'OPENAI_API_KEY is not set. Export it in the environment before running this script.'
    )
  }
  const client = new OpenAI({ apiKey })
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  })
  const browser = await chromium.launch({
    headless: false,
    args: ['--window-size=1440,900']
  })
  const context = await browser.newContext({
    viewport: { width: 1440, height: 900 }
  })
  const page = await context.newPage()

  // Full transcript sent to the model on every round.
  const conversation: any[] = []
  // Output collected from the sandbox during one exec_js call; it is flushed
  // into the conversation and cleared after each call.
  const jsOutput: any[] = []
  const sandbox: Record<string, any> = {
    console: {
      log: (...xs: any[]) => {
        jsOutput.push({
          type: 'input_text',
          text: util.formatWithOptions(
            { showHidden: false, getters: false, maxStringLength: 2000 },
            ...xs
          )
        })
      }
    },
    browser: browser,
    context: context,
    page: page,
    display: (base64Image: string) => {
      // Remove whitespace/newlines from base64 which can invalidate OpenAI API
      const cleanB64 = base64Image.replace(/\s+/g, '')
      jsOutput.push({
        type: 'input_image',
        image_url: `data:image/png;base64,${cleanB64}`,
        detail: 'original'
      })
    }
  }
  // NOTE(review): node:vm is not a security boundary, and the sandbox is handed
  // live host objects (browser/context/page). Model-written code and the shell
  // tool below run with this process's privileges — only run against trusted
  // prompts/sites or inside an isolated machine.
  const ctx = vm.createContext(sandbox)

  conversation.push({
    role: 'user',
    content: prompt
  })

  let round = 0
  while (true) {
    round += 1
    console.log(`\n=== ROUND ${round} ===\n`)
    //await sleep(1000) // pause between rounds for readability
    const resp = await client.responses.create({
      model,
      tools: [
        {
          type: 'shell',
          environment: { type: 'local' }
        },
        {
          type: 'function' as const,
          name: 'exec_js',
          description:
            'Execute provided interactive JavaScript in a persistent REPL context.',
          parameters: {
            type: 'object',
            properties: {
              code: {
                type: 'string',
                description: `
JavaScript to execute. Write small snippets of interactive code. To persist variables or functions across tool calls, you must save them to globalThis. Code is executed in an async node:vm context, so you can use await. You have access to ONLY the following:
- console.log(x): Use this to read contents back to you. But be minimal: otherwise the output may be too long. Avoid using console.log() for large base64 payloads like screenshots or buffer. If you create an image or screenshot, pass the base64 string to display().
- display(base64_image_string): Use this to view a base64-encoded image.
- Do not write screenshots or image data to temporary files or disk just to pass them back. Keep image data in memory and send it directly to display().
- Do not assume package globals like Bun.file are available unless they are explicitly provided.
- browser: A playwright chromium browser instance.
- context: A playwright browser context with viewport 1440x900.
- page: A playwright page already created in that context.
`
              }
            },
            required: ['code'],
            additionalProperties: false
          }
        },
        {
          type: 'function' as const,
          name: 'ask_user',
          description:
            'Ask the user a clarification question and wait for their response.',
          parameters: {
            type: 'object',
            properties: {
              question: {
                type: 'string',
                description:
                  'The exact question to show the human. Use this instead of answering with a freeform clarifying question in a final answer.'
              }
            },
            required: ['question'],
            additionalProperties: false
          }
        },
        {
          type: 'function' as const,
          name: 'end_program',
          description:
            'End the program and close the browser. Call this when the task is complete.',
          parameters: {
            type: 'object',
            properties: {},
            required: [],
            // Kept consistent with the other tool schemas above.
            additionalProperties: false
          }
        }
      ],
      input: conversation,
      reasoning: {
        effort: 'low'
      }
    })

    // Save model outputs into the running conversation
    conversation.push(...resp.output)

    // Handle tool calls
    for (const item of resp.output) {
      console.log('response output item =', item)
      if (item.type === 'shell_call') {
        console.log('shell call item =', item)
        const shellCallOutput: { stdout: string; stderr: string; outcome: { type: string; exit_code: number } }[] = []
        for (const command of item.action.commands) {
          console.log('$', command, '\n')

          // Report the command's real exit status so the model can tell
          // failures apart from successes.
          const { stdout, stderr, code } = shell.exec(command, {
            silent: true
          })
          shellCallOutput.push({
            stdout,
            stderr,
            outcome: { type: 'exit', exit_code: code }
          })
        }

        conversation.push({
          type: 'shell_call_output',
          call_id: item.call_id,
          output: shellCallOutput
        })
      } else if (item.type === 'function_call' && item.name === 'exec_js') {
        const parsed = JSON.parse(item.arguments ?? '{}') as {
          code?: string
        }
        const code = parsed.code ?? ''
        console.log(code)
        console.log('----')
        // Wrap in an async IIFE so the snippet may use top-level `await`.
        const wrappedCode = `
                (async () => {
                    ${code}
                })();
            `

        try {
          await new vm.Script(wrappedCode, {
            filename: 'exec_js.js'
          }).runInContext(ctx)
        } catch (e: any) {
          // Errors created inside the vm context come from another realm, so
          // `instanceof Error` is unreliable here; read the fields defensively
          // and feed them back to the model instead of crashing the loop.
          sandbox.console.log(e, e?.message, e?.stack)
        }

        // Send tool output back to the model, keyed by call_id
        conversation.push({
          type: 'function_call_output',
          call_id: item.call_id,
          output: jsOutput.slice()
        })

        for (const out of jsOutput) {
          if (out.type === 'input_text') {
            console.log('JS LOG:', out.text)
          } else if (out.type === 'input_image') {
            console.log('JS IMAGE: [base64 string omitted]')
          }
        }
        console.log('=====')

        // Reset the shared buffer for the next exec_js call.
        jsOutput.length = 0
      } else if (item.type === 'function_call' && item.name === 'ask_user') {
        const parsed = JSON.parse(item.arguments ?? '{}') as {
          question?: string
        }
        const question = parsed.question ?? 'Please provide more information.'
        console.log(`MODEL QUESTION: ${question}`)
        const answer = await rl.question('> ')
        conversation.push({
          type: 'function_call_output',
          call_id: item.call_id,
          output: answer
        })
      } else if (item.type === 'function_call' && item.name === 'end_program') {
        console.log('Closing browser and exiting program...')
        await browser.close()
        rl.close()
        process.exit(0)
      } else if (item.type === 'message') {
        // NOTE(review): a round that yields only a message adds no tool output,
        // yet the loop re-queries the model right away — confirm that is
        // intended rather than waiting for user input here.
        console.log('Message:', item.content[0]?.text ?? item.content)
      }
    } // end of per-item handling for this round
  } // end of the unbounded round loop (exits only via end_program)
}

/**
 * Reads the user prompt override from the command line.
 *
 * Scans the arguments after the runtime and script path (`Bun.argv.slice(2)`)
 * for the first `--prompt` flag and returns the argument that follows it.
 *
 * @returns The argument after the first `--prompt`, or `undefined` when the
 *   flag is absent or is the last argument.
 */
function getCliPrompt (): string | undefined {
  const cliArgs = Bun.argv.slice(2)
  const flagIndex = cliArgs.indexOf('--prompt')
  if (flagIndex === -1) {
    return undefined
  }
  // Indexing past the end yields undefined, matching the "flag without value" case.
  return cliArgs[flagIndex + 1]
}

// Script entry point: run the agent with the optional --prompt override. Any
// rejection from main() is logged and turned into a non-zero exit status.
void main(getCliPrompt()).catch(failure => {
  console.error('Error occurred:', failure)
  process.exit(1)
})