OpenAI Realtime Assistant

NOTE

A guide is work in progress for enabling interruptions for the Realtime Assistant.

We are going to create a simple example for communicating with the Assistant. This application will playback responses from text input.

Install the required packages

pip install openai[realtime] aioconsole

We will be using an modified script from azure_realtime.py

python

import asyncio
import aioconsole
from openai import AsyncOpenAI

API_KEY='YOUR_API_KEY'

async def Assistant():
    client = AsyncOpenAI(api_key=API_KEY)
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        await connection.session.update(session={'modalities': ['text', 'audio']})

        while True:
            user_input = await aioconsole.ainput("Enter a message: ")
            if user_input == "q":
                break

            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": user_input}],
                }
            )
            await connection.response.create()
            async for event in connection:
                if event.type == "response.audio.done":
                    break
                elif event.type == "response.audio.delta":
                    pass
asyncio.run(Assistant())

Identify the corresponding sample rate and sample format of the audio data.

According to their Realtime API Documentation, you can specify the audio data output_audio_format to be PCM with 24kHZ and 16 bit-depth. We shall adjust our and configure these settings:

python

import asyncio
import aioconsole
from openai import AsyncOpenAI
from miniaudio import SampleFormat

API_KEY='YOUR_API_KEY'

SAMPLE_RATE = 24000
SAMPLE_FORMAT = SampleFormat.SIGNED16

async def Assistant():
    client = AsyncOpenAI(api_key=API_KEY)
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        await connection.session.update(session={'modalities': ['text', 'audio'], 'output_audio_format': 'pcm16'})

        while True:
            user_input = await aioconsole.ainput("Enter a message: ")
            if user_input == "q":
                break

            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": user_input}],
                }
            )
            await connection.response.create()
            async for event in connection:
                if event.type == "response.audio.done":
                    break
                elif event.type == "response.audio.delta":
                    pass
asyncio.run(Assistant())

Create a custom generator to process the audio data.

The raw audio data is encoded in base64:

python

import asyncio
import aioconsole
from openai import AsyncOpenAI
from miniaudio import SampleFormat
from base64 import b64decode

API_KEY='YOUR_API_KEY'

SAMPLE_RATE = 24000
SAMPLE_FORMAT = SampleFormat.SIGNED16

async def process_audio(connection):
    async for event in connection:  
        if event.type == "response.audio.delta":
            audio = b64decode(event.delta)
            yield audio

async def Assistant():
    client = AsyncOpenAI(api_key=API_KEY)
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        await connection.session.update(session={'modalities': ['text', 'audio'], 'output_audio_format': 'pcm16'})

        while True:
            user_input = await aioconsole.ainput("Enter a message: ")
            if user_input == "q":
                break

            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": user_input}],
                }
            )
            await connection.response.create()
asyncio.run(Assistant())

Plug the audio stream into the Speakers with the corresponding sample format and sample rate:

python

import asyncio
import aioconsole
from openai import AsyncOpenAI
from miniaudio import SampleFormat
from minispeaker import Speakers
from base64 import b64decode

API_KEY='YOUR_API_KEY'

SAMPLE_RATE = 24000
SAMPLE_FORMAT = SampleFormat.SIGNED16

async def process_audio(connection):
    async for event in connection:  
        if event.type == "response.audio.delta":
            audio = b64decode(event.delta)
            yield audio

async def Assistant():
    client = AsyncOpenAI(api_key=API_KEY)
    speakers = Speakers(sample_rate=SAMPLE_RATE, sample_format=SAMPLE_FORMAT)
    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview") as connection:
        speakers.play(process_audio(connection), name="speech")
        await connection.session.update(session={'modalities': ['text', 'audio'], 'output_audio_format': 'pcm16'})

        while True:
            user_input = await aioconsole.ainput("Enter a message: ")
            if user_input == "q":
                break

            await connection.conversation.item.create(
                item={
                    "type": "message",
                    "role": "user",
                    "content": [{"type": "input_text", "text": user_input}],
                }
            )
            await connection.response.create()
asyncio.run(Assistant())

WARNING

Selecting the incorrect sample rate and format may result in non-working audio playback.

OpenAI Realtime Assistant ​

OpenAI Realtime Assistant