// src/resources/beta/realtime/realtime.ts

// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

import { APIResource } from '../../../resource';
import * as RealtimeAPI from './realtime';
import * as SessionsAPI from './sessions';
import {
  Session as SessionsAPISession,
  SessionCreateParams,
  SessionCreateResponse,
  Sessions,
} from './sessions';

/**
 * Resource for the beta Realtime API. It exposes the `sessions` sub-resource,
 * which is constructed with this resource's `_client`.
 */
export class Realtime extends APIResource {
  // Field initializer: the sub-resource is created when this resource is instantiated.
  sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client);
}

/**
 * Server event announcing that a conversation has been created. It is emitted
 * right after the session is created.
 */
export interface ConversationCreatedEvent {
  /**
   * Discriminant for this event; always `conversation.created`.
   */
  type: 'conversation.created';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * The conversation resource that was created.
   */
  conversation: ConversationCreatedEvent.Conversation;
}

export namespace ConversationCreatedEvent {
  /**
   * The conversation resource carried by the event.
   */
  export interface Conversation {
    /**
     * Object type marker; always `realtime.conversation`.
     */
    object?: 'realtime.conversation';

    /**
     * Unique ID of the conversation.
     */
    id?: string;
  }
}

/**
 * A single item in the conversation: a message, a function call, or the output
 * of a function call. Which fields are meaningful depends on `type`.
 */
export interface ConversationItem {
  /**
   * Kind of item: `message`, `function_call`, or `function_call_output`.
   */
  type?: 'message' | 'function_call' | 'function_call_output';

  /**
   * Unique ID of the item. The client may generate it to help manage server-side
   * context, but it is optional: the server generates one when it is omitted.
   */
  id?: string;

  /**
   * Identifier for the API object being returned; always `realtime.item`.
   */
  object?: 'realtime.item';

  /**
   * Item status (`completed`, `incomplete`). It has no effect on the conversation
   * and is accepted only for consistency with the `conversation.item.created`
   * event.
   */
  status?: 'completed' | 'incomplete';

  /**
   * Role of the message sender (`user`, `assistant`, `system`). Applies to
   * `message` items only.
   */
  role?: 'user' | 'assistant' | 'system';

  /**
   * Content of the message. Applies to `message` items.
   *
   * - Role `system` supports only `input_text` content.
   * - Role `user` supports `input_text` and `input_audio` content.
   * - Role `assistant` supports `text` content.
   */
  content?: ConversationItemContent[];

  /**
   * ID of the function call (for `function_call` and `function_call_output`
   * items). When given on a `function_call_output` item, the server checks that a
   * `function_call` item with the same ID exists in the conversation history.
   */
  call_id?: string;

  /**
   * Name of the function being called (for `function_call` items).
   */
  name?: string;

  /**
   * Arguments of the function call (for `function_call` items).
   */
  arguments?: string;

  /**
   * Output of the function call (for `function_call_output` items).
   */
  output?: string;
}

/**
 * One content part of a conversation item. Which of the optional fields is used
 * depends on `type`.
 */
export interface ConversationItemContent {
  /**
   * Content type: `input_text`, `input_audio`, `item_reference`, or `text`.
   */
  type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';

  /**
   * Text content; used by the `input_text` and `text` content types.
   */
  text?: string;

  /**
   * Base64-encoded audio bytes; used by the `input_audio` content type.
   */
  audio?: string;

  /**
   * Transcript of the audio; used by the `input_audio` content type.
   */
  transcript?: string;

  /**
   * ID of a previous conversation item to reference (for `item_reference` content
   * types in `response.create` events). Both client-created and server-created
   * items can be referenced.
   */
  id?: string;
}

/**
 * Client event that adds a new Item to the Conversation's context: a message, a
 * function call, or a function call response. It can be used to populate a
 * "history" of the conversation as well as to add new items mid-stream. Current
 * limitation: it cannot populate assistant audio messages.
 *
 * On success the server responds with a `conversation.item.created` event;
 * otherwise an `error` event is sent.
 */
export interface ConversationItemCreateEvent {
  /**
   * Discriminant for this event; always `conversation.item.create`.
   */
  type: 'conversation.item.create';

  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;

  /**
   * ID of the preceding item after which the new item is inserted.
   *
   * - Not set: the new item is appended to the end of the conversation.
   * - `root`: the new item is added to the beginning of the conversation.
   * - An existing ID: the item is inserted mid-conversation after that item.
   *
   * If the ID cannot be found, an error is returned and the item is not added.
   */
  previous_item_id?: string;

  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
}

/**
 * Server event returned when a conversation item is created. Several scenarios
 * produce it:
 *
 * - The server is generating a Response, which if successful produces either one
 *   or two Items, of type `message` (role `assistant`) or type `function_call`.
 * - The input audio buffer has been committed, either by the client or by the
 *   server (in `server_vad` mode). The server takes the content of the input
 *   audio buffer and adds it to a new user message Item.
 * - The client has sent a `conversation.item.create` event to add a new Item to
 *   the Conversation.
 */
export interface ConversationItemCreatedEvent {
  /**
   * Discriminant for this event; always `conversation.item.created`.
   */
  type: 'conversation.item.created';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the preceding item in the Conversation context. It lets the client
   * understand the order of the conversation.
   */
  previous_item_id: string;

  /**
   * The conversation item carried by this event.
   */
  item: ConversationItem;
}

/**
 * Client event that removes an item from the conversation history. The server
 * responds with a `conversation.item.deleted` event, or with an error when the
 * item does not exist in the conversation history.
 */
export interface ConversationItemDeleteEvent {
  /**
   * Discriminant for this event; always `conversation.item.delete`.
   */
  type: 'conversation.item.delete';

  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;

  /**
   * ID of the item to delete.
   */
  item_id: string;
}

/**
 * Server event returned when the client deletes a conversation item with a
 * `conversation.item.delete` event. It keeps the server's understanding of the
 * conversation history in sync with the client's view.
 */
export interface ConversationItemDeletedEvent {
  /**
   * Discriminant for this event; always `conversation.item.deleted`.
   */
  type: 'conversation.item.deleted';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the item that was deleted.
   */
  item_id: string;
}

/**
 * Server event carrying the transcription of user audio written to the user
 * audio buffer. Transcription starts when the input audio buffer is committed by
 * the client or by the server (in `server_vad` mode). It runs asynchronously with
 * Response creation, so this event may arrive before or after the Response
 * events.
 *
 * Realtime API models accept audio natively, so input transcription is a separate
 * process run on a separate ASR (Automatic Speech Recognition) model, currently
 * always `whisper-1`. The transcript may therefore diverge somewhat from the
 * model's interpretation and should be treated as a rough guide.
 */
export interface ConversationItemInputAudioTranscriptionCompletedEvent {
  /**
   * Discriminant for this event; always
   * `conversation.item.input_audio_transcription.completed`.
   */
  type: 'conversation.item.input_audio_transcription.completed';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the user message item containing the audio.
   */
  item_id: string;

  /**
   * Index of the content part containing the audio.
   */
  content_index: number;

  /**
   * The transcribed text.
   */
  transcript: string;
}

/**
 * Server event returned when input audio transcription is configured and a
 * transcription request for a user message failed. It is kept separate from other
 * `error` events so that the client can identify the related Item.
 */
export interface ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * Discriminant for this event; always
   * `conversation.item.input_audio_transcription.failed`.
   */
  type: 'conversation.item.input_audio_transcription.failed';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the user message item.
   */
  item_id: string;

  /**
   * Index of the content part containing the audio.
   */
  content_index: number;

  /**
   * Details of the transcription error.
   */
  error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
}

export namespace ConversationItemInputAudioTranscriptionFailedEvent {
  /**
   * Details of the transcription error. Every field is optional.
   */
  export interface Error {
    /**
     * The type of error.
     */
    type?: string;

    /**
     * Error code, if any.
     */
    code?: string;

    /**
     * Human-readable error message.
     */
    message?: string;

    /**
     * Parameter related to the error, if any.
     */
    param?: string;
  }
}

/**
 * Client event that truncates a previous assistant message's audio. The server
 * produces audio faster than realtime, so this event is useful when the user
 * interrupts: it truncates audio that has already been sent to the client but
 * not yet played, synchronizing the server's understanding of the audio with the
 * client's playback.
 *
 * Truncating audio deletes the server-side text transcript, to ensure there is no
 * text in the context that hasn't been heard by the user.
 *
 * On success the server responds with a `conversation.item.truncated` event.
 */
export interface ConversationItemTruncateEvent {
  /**
   * Discriminant for this event; always `conversation.item.truncate`.
   */
  type: 'conversation.item.truncate';

  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;

  /**
   * ID of the assistant message item to truncate. Only assistant message items
   * can be truncated.
   */
  item_id: string;

  /**
   * Index of the content part to truncate. Set this to 0.
   */
  content_index: number;

  /**
   * Inclusive duration up to which audio is truncated, in milliseconds. If
   * `audio_end_ms` is greater than the actual audio duration, the server responds
   * with an error.
   */
  audio_end_ms: number;
}

/**
 * Server event returned when the client truncates an earlier assistant audio
 * message item with a `conversation.item.truncate` event. It synchronizes the
 * server's understanding of the audio with the client's playback.
 *
 * The action truncates the audio and removes the server-side text transcript, to
 * ensure there is no text in the context that hasn't been heard by the user.
 */
export interface ConversationItemTruncatedEvent {
  /**
   * Discriminant for this event; always `conversation.item.truncated`.
   */
  type: 'conversation.item.truncated';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the assistant message item that was truncated.
   */
  item_id: string;

  /**
   * Index of the content part that was truncated.
   */
  content_index: number;

  /**
   * Duration up to which the audio was truncated, in milliseconds.
   */
  audio_end_ms: number;
}

/**
 * Server event returned when an error occurs, which may be a client problem or a
 * server problem. Most errors are recoverable and the session stays open.
 * Implementors are advised to monitor and log error messages by default.
 */
export interface ErrorEvent {
  /**
   * Discriminant for this event; always `error`.
   */
  type: 'error';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * Details of the error.
   */
  error: ErrorEvent.Error;
}

export namespace ErrorEvent {
  /**
   * Details of the error. `type` and `message` are always present; the remaining
   * fields are optional and may be `null`.
   */
  export interface Error {
    /**
     * The type of error (e.g., "invalid_request_error", "server_error").
     */
    type: string;

    /**
     * Human-readable error message.
     */
    message: string;

    /**
     * Error code, if any.
     */
    code?: string | null;

    /**
     * Parameter related to the error, if any.
     */
    param?: string | null;

    /**
     * The `event_id` of the client event that caused the error, if applicable.
     */
    event_id?: string | null;
  }
}

/**
 * Client event that appends audio bytes to the input audio buffer. The audio
 * buffer is temporary storage that can be written to and committed later. In
 * Server VAD mode the buffer is used to detect speech and the server decides when
 * to commit. When Server VAD is disabled, the audio buffer must be committed
 * manually.
 *
 * The client may choose how much audio to place in each event, up to a maximum of
 * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD
 * to be more responsive. Unlike most other client events, the server does not
 * send a confirmation response to this event.
 */
export interface InputAudioBufferAppendEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.append`.
   */
  type: 'input_audio_buffer.append';

  /**
   * Optional client-generated ID used to identify this event.
   */
  event_id?: string;

  /**
   * Base64-encoded audio bytes. They must be in the format specified by the
   * `input_audio_format` field in the session configuration.
   */
  audio: string;
}

/**
 * Client event that clears the audio bytes held in the input audio buffer. The
 * server acknowledges it with an `input_audio_buffer.cleared` event.
 */
export interface InputAudioBufferClearEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.clear`.
   */
  type: 'input_audio_buffer.clear';

  /**
   * Client-generated ID identifying this event. Optional.
   */
  event_id?: string;
}

/**
 * Server event returned when the client clears the input audio buffer with an
 * `input_audio_buffer.clear` event.
 */
export interface InputAudioBufferClearedEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.cleared`.
   */
  type: 'input_audio_buffer.cleared';

  /**
   * Unique ID of this server event.
   */
  event_id: string;
}

/**
 * Client event that commits the user input audio buffer, creating a new user
 * message item in the conversation. It produces an error if the input audio
 * buffer is empty. In Server VAD mode the client does not need to send it,
 * because the server commits the audio buffer automatically.
 *
 * Committing the input audio buffer triggers input audio transcription (if
 * enabled in the session configuration) but does not create a response from the
 * model. The server responds with an `input_audio_buffer.committed` event.
 */
export interface InputAudioBufferCommitEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.commit`.
   */
  type: 'input_audio_buffer.commit';

  /**
   * Client-generated ID identifying this event. Optional.
   */
  event_id?: string;
}

/**
 * Server event returned when an input audio buffer is committed, either by the
 * client or automatically in server VAD mode. `item_id` is the ID of the user
 * message item that will be created, so a `conversation.item.created` event is
 * also sent to the client.
 */
export interface InputAudioBufferCommittedEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.committed`.
   */
  type: 'input_audio_buffer.committed';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the user message item that will be created.
   */
  item_id: string;

  /**
   * ID of the preceding item after which the new item will be inserted.
   */
  previous_item_id: string;
}

/**
 * Server event sent in `server_vad` mode to indicate that speech has been
 * detected in the audio buffer. It can occur any time audio is added to the
 * buffer (unless speech is already detected). The client may want to use it to
 * interrupt audio playback or to give the user visual feedback.
 *
 * The client should expect an `input_audio_buffer.speech_stopped` event when
 * speech stops. `item_id` is the ID of the user message item that will be created
 * when speech stops; it is also included in the
 * `input_audio_buffer.speech_stopped` event (unless the client manually commits
 * the audio buffer during VAD activation).
 */
export interface InputAudioBufferSpeechStartedEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.speech_started`.
   */
  type: 'input_audio_buffer.speech_started';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the user message item that will be created when speech stops.
   */
  item_id: string;

  /**
   * Milliseconds from the start of all audio written to the buffer during the
   * session when speech was first detected. This corresponds to the beginning of
   * the audio sent to the model, and so includes the `prefix_padding_ms`
   * configured in the Session.
   */
  audio_start_ms: number;
}

/**
 * Server event returned in `server_vad` mode when the server detects the end of
 * speech in the audio buffer. The server also sends a
 * `conversation.item.created` event with the user message item created from the
 * audio buffer.
 */
export interface InputAudioBufferSpeechStoppedEvent {
  /**
   * Discriminant for this event; always `input_audio_buffer.speech_stopped`.
   */
  type: 'input_audio_buffer.speech_stopped';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the user message item that will be created.
   */
  item_id: string;

  /**
   * Milliseconds since the session started when speech stopped. This corresponds
   * to the end of the audio sent to the model, and so includes the
   * `min_silence_duration_ms` configured in the Session.
   */
  audio_end_ms: number;
}

/**
 * Server event emitted at the beginning of a Response to report the updated rate
 * limits. When a Response is created some tokens are "reserved" for the output
 * tokens; the rate limits shown here reflect that reservation, which is adjusted
 * accordingly once the Response is completed.
 */
export interface RateLimitsUpdatedEvent {
  /**
   * Discriminant for this event; always `rate_limits.updated`.
   */
  type: 'rate_limits.updated';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * List of rate limit information.
   */
  rate_limits: RateLimitsUpdatedEvent.RateLimit[];
}

export namespace RateLimitsUpdatedEvent {
  /**
   * One entry of the `rate_limits` list.
   */
  export interface RateLimit {
    /**
     * Name of the rate limit (`requests`, `tokens`).
     */
    name?: 'requests' | 'tokens';

    /**
     * Maximum allowed value for the rate limit.
     */
    limit?: number;

    /**
     * Remaining value before the limit is reached.
     */
    remaining?: number;

    /**
     * Seconds until the rate limit resets.
     */
    reset_seconds?: number;
  }
}

/**
 * All events that the client can send to the Realtime API.
 *
 * The member interfaces declared alongside this alias each carry a distinct
 * string-literal `type` field (e.g. `input_audio_buffer.append`,
 * `response.create`), so a value of this union can be narrowed by checking
 * `type`.
 *
 * NOTE(review): assumes `SessionUpdateEvent` also has a literal `type` field;
 * confirm before relying on exhaustive narrowing.
 */
export type RealtimeClientEvent =
  | SessionUpdateEvent
  | InputAudioBufferAppendEvent
  | InputAudioBufferCommitEvent
  | InputAudioBufferClearEvent
  | ConversationItemCreateEvent
  | ConversationItemTruncateEvent
  | ConversationItemDeleteEvent
  | ResponseCreateEvent
  | ResponseCancelEvent;

/**
 * The response resource. Every field is optional.
 */
export interface RealtimeResponse {
  /**
   * Object type marker; always `realtime.response`.
   */
  object?: 'realtime.response';

  /**
   * Unique ID of the response.
   */
  id?: string;

  /**
   * Final status of the response (`completed`, `cancelled`, `failed`, or
   * `incomplete`).
   */
  status?: 'completed' | 'cancelled' | 'failed' | 'incomplete';

  /**
   * Additional details about the status.
   */
  status_details?: RealtimeResponseStatus;

  /**
   * List of output items generated by the response.
   */
  output?: ConversationItem[];

  /**
   * Usage statistics for the Response; these correspond to billing. A Realtime
   * API session maintains a conversation context and appends new Items to the
   * Conversation, so output from previous turns (text and audio tokens) becomes
   * the input for later turns.
   */
  usage?: RealtimeResponseUsage;

  /**
   * Developer-provided string key-value pairs associated with this response.
   */
  metadata?: unknown | null;
}

/**
 * Additional details about the status of a response.
 */
export interface RealtimeResponseStatus {
  /**
   * The type of error that caused the response to fail; its values correspond
   * with the `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
   */
  type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';

  /**
   * Reason the Response did not complete.
   *
   * - For a `cancelled` Response: `turn_detected` (the server VAD detected a new
   *   start of speech) or `client_cancelled` (the client sent a cancel event).
   * - For an `incomplete` Response: `max_output_tokens` or `content_filter` (the
   *   server-side safety filter activated and cut off the response).
   */
  reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';

  /**
   * Description of the error that caused the response to fail. Populated when the
   * `status` is `failed`.
   */
  error?: RealtimeResponseStatus.Error;
}

export namespace RealtimeResponseStatus {
  /**
   * Description of the error that caused the response to fail. Populated when the
   * `status` is `failed`.
   */
  export interface Error {
    /**
     * The type of error.
     */
    type?: string;

    /**
     * Error code, if any.
     */
    code?: string;
  }
}

/**
 * Usage statistics for the Response; these correspond to billing. A Realtime API
 * session maintains a conversation context and appends new Items to the
 * Conversation, so output from previous turns (text and audio tokens) becomes the
 * input for later turns.
 */
export interface RealtimeResponseUsage {
  /**
   * Total number of tokens in the Response, including input and output text and
   * audio tokens.
   */
  total_tokens?: number;

  /**
   * Number of input tokens used in the Response, including text and audio tokens.
   */
  input_tokens?: number;

  /**
   * Details about the input tokens used in the Response.
   */
  input_token_details?: RealtimeResponseUsage.InputTokenDetails;

  /**
   * Number of output tokens sent in the Response, including text and audio
   * tokens.
   */
  output_tokens?: number;

  /**
   * Details about the output tokens used in the Response.
   */
  output_token_details?: RealtimeResponseUsage.OutputTokenDetails;
}

export namespace RealtimeResponseUsage {
  /**
   * Details about the input tokens used in the Response.
   */
  export interface InputTokenDetails {
    /**
     * Number of text tokens used in the Response.
     */
    text_tokens?: number;

    /**
     * Number of audio tokens used in the Response.
     */
    audio_tokens?: number;

    /**
     * Number of cached tokens used in the Response.
     */
    cached_tokens?: number;
  }

  /**
   * Details about the output tokens used in the Response.
   */
  export interface OutputTokenDetails {
    /**
     * Number of text tokens used in the Response.
     */
    text_tokens?: number;

    /**
     * Number of audio tokens used in the Response.
     */
    audio_tokens?: number;
  }
}

/**
 * All events that the Realtime API can send back.
 *
 * The member interfaces declared alongside this alias each carry a distinct
 * string-literal `type` field (e.g. `error`, `rate_limits.updated`,
 * `response.audio.delta`), so a value of this union can be narrowed by checking
 * `type`.
 *
 * NOTE(review): assumes every member follows that pattern, including the session,
 * response lifecycle, output item, text and function-call-argument events;
 * confirm before relying on exhaustive narrowing.
 */
export type RealtimeServerEvent =
  | ErrorEvent
  | SessionCreatedEvent
  | SessionUpdatedEvent
  | ConversationCreatedEvent
  | InputAudioBufferCommittedEvent
  | InputAudioBufferClearedEvent
  | InputAudioBufferSpeechStartedEvent
  | InputAudioBufferSpeechStoppedEvent
  | ConversationItemCreatedEvent
  | ConversationItemInputAudioTranscriptionCompletedEvent
  | ConversationItemInputAudioTranscriptionFailedEvent
  | ConversationItemTruncatedEvent
  | ConversationItemDeletedEvent
  | ResponseCreatedEvent
  | ResponseDoneEvent
  | ResponseOutputItemAddedEvent
  | ResponseOutputItemDoneEvent
  | ResponseContentPartAddedEvent
  | ResponseContentPartDoneEvent
  | ResponseTextDeltaEvent
  | ResponseTextDoneEvent
  | ResponseAudioTranscriptDeltaEvent
  | ResponseAudioTranscriptDoneEvent
  | ResponseAudioDeltaEvent
  | ResponseAudioDoneEvent
  | ResponseFunctionCallArgumentsDeltaEvent
  | ResponseFunctionCallArgumentsDoneEvent
  | RateLimitsUpdatedEvent;

/**
 * Server event returned when the model-generated audio is updated.
 */
export interface ResponseAudioDeltaEvent {
  /**
   * Discriminant for this event; always `response.audio.delta`.
   */
  type: 'response.audio.delta';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Index of the output item in the response.
   */
  output_index: number;

  /**
   * Index of the content part in the item's content array.
   */
  content_index: number;

  /**
   * Base64-encoded audio data delta.
   */
  delta: string;
}

/**
 * Server event returned when the model-generated audio is done. It is also
 * emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseAudioDoneEvent {
  /**
   * Discriminant for this event; always `response.audio.done`.
   */
  type: 'response.audio.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Index of the output item in the response.
   */
  output_index: number;

  /**
   * Index of the content part in the item's content array.
   */
  content_index: number;
}

/**
 * Server event returned when the model-generated transcription of audio output
 * is updated.
 */
export interface ResponseAudioTranscriptDeltaEvent {
  /**
   * Discriminant for this event; always `response.audio_transcript.delta`.
   */
  type: 'response.audio_transcript.delta';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Index of the output item in the response.
   */
  output_index: number;

  /**
   * Index of the content part in the item's content array.
   */
  content_index: number;

  /**
   * The transcript delta.
   */
  delta: string;
}

/**
 * Server event returned when the model-generated transcription of audio output
 * is done streaming. It is also emitted when a Response is interrupted,
 * incomplete, or cancelled.
 */
export interface ResponseAudioTranscriptDoneEvent {
  /**
   * Discriminant for this event; always `response.audio_transcript.done`.
   */
  type: 'response.audio_transcript.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Index of the output item in the response.
   */
  output_index: number;

  /**
   * Index of the content part in the item's content array.
   */
  content_index: number;

  /**
   * The final transcript of the audio.
   */
  transcript: string;
}

/**
 * Client event that cancels an in-progress response. The server responds with a
 * `response.cancelled` event, or with an error if there is no response to cancel.
 *
 * NOTE(review): no `response.cancelled` member appears in `RealtimeServerEvent`;
 * the cancellation presumably surfaces as a `response.done` event whose response
 * has status `cancelled` — confirm against the API reference.
 */
export interface ResponseCancelEvent {
  /**
   * Discriminant for this event; always `response.cancel`.
   */
  type: 'response.cancel';

  /**
   * Client-generated ID identifying this event. Optional.
   */
  event_id?: string;

  /**
   * ID of a specific response to cancel. When omitted, an in-progress response in
   * the default conversation is cancelled.
   */
  response_id?: string;
}

/**
 * Server event returned when a new content part is added to an assistant message
 * item during response generation.
 */
export interface ResponseContentPartAddedEvent {
  /**
   * Discriminant for this event; always `response.content_part.added`.
   */
  type: 'response.content_part.added';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item to which the content part was added.
   */
  item_id: string;

  /**
   * Index of the output item in the response.
   */
  output_index: number;

  /**
   * Index of the content part in the item's content array.
   */
  content_index: number;

  /**
   * The content part that was added.
   */
  part: ResponseContentPartAddedEvent.Part;
}

export namespace ResponseContentPartAddedEvent {
  /**
   * The content part that was added.
   */
  export interface Part {
    /**
     * Content type ("text", "audio").
     */
    type?: 'text' | 'audio';

    /**
     * Text content (if type is "text").
     */
    text?: string;

    /**
     * Base64-encoded audio data (if type is "audio").
     */
    audio?: string;

    /**
     * Transcript of the audio (if type is "audio").
     */
    transcript?: string;
  }
}

/**
 * Server event returned when a content part is done streaming in an assistant
 * message item. It is also emitted when a Response is interrupted, incomplete, or
 * cancelled.
 */
export interface ResponseContentPartDoneEvent {
  /**
   * Discriminant for this event; always `response.content_part.done`.
   */
  type: 'response.content_part.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Index of the output item in the response.
   */
  output_index: number;

  /**
   * Index of the content part in the item's content array.
   */
  content_index: number;

  /**
   * The content part that is done.
   */
  part: ResponseContentPartDoneEvent.Part;
}

export namespace ResponseContentPartDoneEvent {
  /**
   * The content part that is done.
   */
  export interface Part {
    /**
     * Content type ("text", "audio").
     */
    type?: 'text' | 'audio';

    /**
     * Text content (if type is "text").
     */
    text?: string;

    /**
     * Base64-encoded audio data (if type is "audio").
     */
    audio?: string;

    /**
     * Transcript of the audio (if type is "audio").
     */
    transcript?: string;
  }
}

/**
 * Client event instructing the server to create a Response, i.e. to trigger model
 * inference. In Server VAD mode the server creates Responses automatically.
 *
 * A Response includes at least one Item and may have two, in which case the
 * second is a function call. These Items are appended to the conversation
 * history.
 *
 * The server responds with a `response.created` event, then events for the Items
 * and content created, and finally a `response.done` event indicating that the
 * Response is complete.
 *
 * The `response.create` event carries inference configuration such as
 * `instructions` and `temperature`. These fields override the Session's
 * configuration for this Response only.
 */
export interface ResponseCreateEvent {
  /**
   * Discriminant for this event; always `response.create`.
   */
  type: 'response.create';

  /**
   * Client-generated ID identifying this event. Optional.
   */
  event_id?: string;

  /**
   * Parameters with which the new Realtime response is created.
   */
  response?: ResponseCreateEvent.Response;
}

export namespace ResponseCreateEvent {
  /**
   * Parameters for the new Realtime response.
   */
  export interface Response {
    /**
     * Selects the conversation the response is added to. `auto` and `none` are
     * currently supported, and `auto` is the default. With `auto`, the contents of
     * the response are added to the default conversation. Use `none` to create an
     * out-of-band response that does not add items to the default conversation.
     */
    conversation?: (string & {}) | 'auto' | 'none';

    /**
     * Input items to include in the model prompt. Supplying these creates a new
     * context for this response that does not include the default conversation,
     * although the items may reference items from the default conversation.
     */
    input?: RealtimeAPI.ConversationItem[];

    /**
     * Default system instructions (i.e. the system message) prepended to model
     * calls. Lets the client guide the model on desired responses: both response
     * content and format (e.g. "be extremely succinct", "act friendly", "here are
     * examples of good responses") and audio behavior (e.g. "talk quickly", "inject
     * emotion into your voice", "laugh frequently"). The model is not guaranteed to
     * follow the instructions, but they provide guidance on the desired behavior.
     *
     * When this field is not set, the server uses its own default instructions,
     * which are visible in the `session.created` event at the start of the session.
     */
    instructions?: string;

    /**
     * Upper bound on output tokens for a single assistant response, tool calls
     * included. Pass an integer between 1 and 4096 to limit output tokens, or `inf`
     * for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';

    /**
     * Set of 16 key-value pairs that can be attached to an object, useful for
     * storing additional information about the object in a structured format. Keys
     * can be at most 64 characters long and values at most 512 characters long.
     */
    metadata?: unknown | null;

    /**
     * Modalities the model can respond with. Set this to ["text"] to disable audio.
     */
    modalities?: ('text' | 'audio')[];

    /**
     * Output audio format: one of `pcm16`, `g711_ulaw`, or `g711_alaw`.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';

    /**
     * Model sampling temperature, limited to [0.6, 1.2]. Defaults to 0.8.
     */
    temperature?: number;

    /**
     * Tool selection strategy for the model. Options are `auto`, `none`, `required`,
     * or specify a function, like
     * `{"type": "function", "function": {"name": "my_function"}}`.
     */
    tool_choice?: string;

    /**
     * Tools (functions) the model may use.
     */
    tools?: Response.Tool[];

    /**
     * Voice the model responds with. It cannot be changed during the session once
     * the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`.
     */
    voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }

  export namespace Response {
    /**
     * A single tool (function) made available to the model.
     */
    export interface Tool {
      /**
       * Tool type, i.e. `function`.
       */
      type?: 'function';

      /**
       * Name of the function.
       */
      name?: string;

      /**
       * Description of the function, including guidance on when and how to call it
       * and on what to tell the user when calling (if anything).
       */
      description?: string;

      /**
       * Function parameters, expressed in JSON Schema.
       */
      parameters?: unknown;
    }
  }
}

/**
 * Emitted when a new Response is created. This is the first event of response
 * creation; the response is in its initial `in_progress` state.
 */
export interface ResponseCreatedEvent {
  /**
   * Event discriminator; always `response.created`.
   */
  type: 'response.created';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * The response resource.
   */
  response: RealtimeResponse;
}

/**
 * Emitted when a Response has finished streaming. It is always emitted,
 * whatever the final state. The Response object carried by `response.done`
 * includes all output Items in the Response but omits the raw audio data.
 */
export interface ResponseDoneEvent {
  /**
   * Event discriminator; always `response.done`.
   */
  type: 'response.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * The response resource.
   */
  response: RealtimeResponse;
}

/**
 * Emitted when the model-generated function call arguments are updated.
 */
export interface ResponseFunctionCallArgumentsDeltaEvent {
  /**
   * Event discriminator; always `response.function_call_arguments.delta`.
   */
  type: 'response.function_call_arguments.delta';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the function call item.
   */
  item_id: string;

  /**
   * ID of the function call.
   */
  call_id: string;

  /**
   * Position of the output item within the response.
   */
  output_index: number;

  /**
   * The arguments delta, as a JSON string.
   */
  delta: string;
}

/**
 * Emitted when the model-generated function call arguments have finished
 * streaming. It is also emitted when a Response is interrupted, incomplete, or
 * cancelled.
 */
export interface ResponseFunctionCallArgumentsDoneEvent {
  /**
   * Event discriminator; always `response.function_call_arguments.done`.
   */
  type: 'response.function_call_arguments.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the function call item.
   */
  item_id: string;

  /**
   * ID of the function call.
   */
  call_id: string;

  /**
   * Position of the output item within the response.
   */
  output_index: number;

  /**
   * The final arguments, as a JSON string.
   */
  arguments: string;
}

/**
 * Emitted when a new Item is created during Response generation.
 */
export interface ResponseOutputItemAddedEvent {
  /**
   * Event discriminator; always `response.output_item.added`.
   */
  type: 'response.output_item.added';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the Response the item belongs to.
   */
  response_id: string;

  /**
   * Position of the output item within the Response.
   */
  output_index: number;

  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
}

/**
 * Emitted when an Item has finished streaming. It is also emitted when a
 * Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseOutputItemDoneEvent {
  /**
   * Event discriminator; always `response.output_item.done`.
   */
  type: 'response.output_item.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the Response the item belongs to.
   */
  response_id: string;

  /**
   * Position of the output item within the Response.
   */
  output_index: number;

  /**
   * The item to add to the conversation.
   */
  item: ConversationItem;
}

/**
 * Emitted when the text value of a "text" content part is updated.
 */
export interface ResponseTextDeltaEvent {
  /**
   * Event discriminator; always `response.text.delta`.
   */
  type: 'response.text.delta';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Position of the output item within the response.
   */
  output_index: number;

  /**
   * Position of the content part within the item's content array.
   */
  content_index: number;

  /**
   * The text delta.
   */
  delta: string;
}

/**
 * Emitted when the text value of a "text" content part has finished streaming.
 * It is also emitted when a Response is interrupted, incomplete, or cancelled.
 */
export interface ResponseTextDoneEvent {
  /**
   * Event discriminator; always `response.text.done`.
   */
  type: 'response.text.done';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * ID of the response.
   */
  response_id: string;

  /**
   * ID of the item.
   */
  item_id: string;

  /**
   * Position of the output item within the response.
   */
  output_index: number;

  /**
   * Position of the content part within the item's content array.
   */
  content_index: number;

  /**
   * The final text content.
   */
  text: string;
}

/**
 * Emitted when a Session is created. It is sent automatically as the first
 * server event once a new connection is established, and contains the default
 * Session configuration.
 */
export interface SessionCreatedEvent {
  /**
   * Event discriminator; always `session.created`.
   */
  type: 'session.created';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * Realtime session object configuration.
   */
  session: SessionsAPI.Session;
}

/**
 * Send this event to update the session's default configuration. The client may
 * send it at any time, and any field may be updated at any time, except for
 * "voice". The server answers with a `session.updated` event showing the full
 * effective configuration. Only the fields that are present are updated, so the
 * correct way to clear a field like "instructions" is to pass an empty string.
 */
export interface SessionUpdateEvent {
  /**
   * Event discriminator; always `session.update`.
   */
  type: 'session.update';

  /**
   * Realtime session object configuration.
   */
  session: SessionUpdateEvent.Session;

  /**
   * Client-generated ID identifying this event. Optional.
   */
  event_id?: string;
}

export namespace SessionUpdateEvent {
  /**
   * Realtime session object configuration.
   */
  export interface Session {
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
     * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
     * (mono), and little-endian byte order.
     */
    input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';

    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through Whisper and should be treated as rough guidance rather
     * than the representation understood by the model.
     *
     * The type admits `null` so that the documented "turn off" value type-checks
     * under `strictNullChecks`.
     */
    input_audio_transcription?: Session.InputAudioTranscription | null;

    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;

    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';

    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;

    /**
     * The Realtime model used for this session.
     */
    model?:
      | 'gpt-4o-realtime-preview'
      | 'gpt-4o-realtime-preview-2024-10-01'
      | 'gpt-4o-realtime-preview-2024-12-17'
      | 'gpt-4o-mini-realtime-preview'
      | 'gpt-4o-mini-realtime-preview-2024-12-17';

    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     * For `pcm16`, output audio is sampled at a rate of 24kHz.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';

    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
     */
    temperature?: number;

    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;

    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<Session.Tool>;

    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     *
     * The type admits `null` so that the documented "turn off" value type-checks
     * under `strictNullChecks`.
     */
    turn_detection?: Session.TurnDetection | null;

    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`.
     */
    voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }

  export namespace Session {
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through Whisper and should be treated as rough guidance rather
     * than the representation understood by the model.
     */
    export interface InputAudioTranscription {
      /**
       * The model to use for transcription, `whisper-1` is the only currently supported
       * model.
       */
      model?: string;
    }

    /**
     * A single tool (function) made available to the model.
     */
    export interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;

      /**
       * The name of the function.
       */
      name?: string;

      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;

      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }

    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    export interface TurnDetection {
      /**
       * Whether or not to automatically generate a response when VAD is enabled. `true`
       * by default.
       */
      create_response?: boolean;

      /**
       * Amount of audio to include before the VAD detected speech (in milliseconds).
       * Defaults to 300ms.
       */
      prefix_padding_ms?: number;

      /**
       * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
       * With shorter values the model will respond more quickly, but may jump in on
       * short pauses from the user.
       */
      silence_duration_ms?: number;

      /**
       * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
       * threshold will require louder audio to activate the model, and thus might
       * perform better in noisy environments.
       */
      threshold?: number;

      /**
       * Type of turn detection, only `server_vad` is currently supported.
       */
      type?: string;
    }
  }
}

/**
 * Emitted when a session has been updated by a `session.update` event, unless
 * there is an error.
 */
export interface SessionUpdatedEvent {
  /**
   * Event discriminator; always `session.updated`.
   */
  type: 'session.updated';

  /**
   * Unique ID of this server event.
   */
  event_id: string;

  /**
   * Realtime session object configuration.
   */
  session: SessionsAPI.Session;
}

// Attach the `Sessions` resource class to `Realtime` as a static property.
Realtime.Sessions = Sessions;

// Declaration-only namespace (no runtime output) that merges with the `Realtime`
// class, so that `Sessions` and the session types imported from './sessions' are
// addressable as `Realtime.Sessions`, `Realtime.Session`, etc.
export declare namespace Realtime {
  export {
    Sessions as Sessions,
    // `SessionsAPISession` is the import alias of `Session` from './sessions'.
    type SessionsAPISession as Session,
    type SessionCreateResponse as SessionCreateResponse,
    type SessionCreateParams as SessionCreateParams,
  };
}