From d92ea4850f3720ba7a372f7bc9f8ecff07392ba0 Mon Sep 17 00:00:00 2001
From: "stainless-app[bot]"
<142633134+stainless-app[bot]@users.noreply.github.com>
Date: Thu, 11 Sep 2025 18:03:52 +0000
Subject: [PATCH] chore(api): Minor docs and type updates for realtime
---
.stats.yml | 4 +-
api.md | 4 +-
realtime/clientsecret.go | 313 ++++++++++++++++-------
realtime/clientsecret_test.go | 18 +-
realtime/realtime.go | 454 ++++++++++++++++++++++++++--------
responses/response.go | 32 +--
shared/constant/constants.go | 6 +
7 files changed, 605 insertions(+), 226 deletions(-)
diff --git a/.stats.yml b/.stats.yml
index 03c3e4c..1561159 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 106
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
-openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
+openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
config_hash: 930dac3aa861344867e4ac84f037b5df
diff --git a/api.md b/api.md
index 6fad482..74d733d 100644
--- a/api.md
+++ b/api.md
@@ -793,7 +793,7 @@ Params Types:
- realtime.RealtimeAudioConfigInputParam
- realtime.RealtimeAudioConfigOutputParam
- realtime.RealtimeAudioFormatsUnionParam
-- realtime.RealtimeAudioInputTurnDetectionParam
+- realtime.RealtimeAudioInputTurnDetectionUnionParam
- realtime.RealtimeFunctionToolParam
- realtime.RealtimeSessionCreateRequestParam
- realtime.RealtimeToolChoiceConfigUnionParam
@@ -802,7 +802,7 @@ Params Types:
- realtime.RealtimeTracingConfigUnionParam
- realtime.RealtimeTranscriptionSessionAudioParam
- realtime.RealtimeTranscriptionSessionAudioInputParam
-- realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionParam
+- realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam
- realtime.RealtimeTranscriptionSessionCreateRequestParam
- realtime.RealtimeTruncationUnionParam
- realtime.RealtimeTruncationRetentionRatioParam
diff --git a/realtime/clientsecret.go b/realtime/clientsecret.go
index ba1dc14..cb9e36b 100644
--- a/realtime/clientsecret.go
+++ b/realtime/clientsecret.go
@@ -8,7 +8,6 @@ import (
"net/http"
"github.com/openai/openai-go/v2/internal/apijson"
- "github.com/openai/openai-go/v2/internal/paramutil"
"github.com/openai/openai-go/v2/internal/requestconfig"
"github.com/openai/openai-go/v2/option"
"github.com/openai/openai-go/v2/packages/param"
@@ -192,15 +191,18 @@ type RealtimeSessionCreateResponseAudioInput struct {
Transcription AudioTranscription `json:"transcription"`
// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
- // response. Server VAD means that the model will detect the start and end of
- // speech based on audio volume and respond at the end of user speech. Semantic VAD
- // is more advanced and uses a turn detection model (in conjunction with VAD) to
- // semantically estimate whether the user has finished speaking, then dynamically
- // sets a timeout based on this probability. For example, if user audio trails off
- // with "uhhm", the model will score a low probability of turn end and wait longer
- // for the user to continue speaking. This can be useful for more natural
- // conversations, but may have a higher latency.
- TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetection `json:"turn_detection"`
+ // response.
+ //
+ // Server VAD means that the model will detect the start and end of speech based on
+ // audio volume and respond at the end of user speech.
+ //
+ // Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ // with VAD) to semantically estimate whether the user has finished speaking, then
+ // dynamically sets a timeout based on this probability. For example, if user audio
+ // trails off with "uhhm", the model will score a low probability of turn end and
+ // wait longer for the user to continue speaking. This can be useful for more
+ // natural conversations, but may have a higher latency.
+ TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetectionUnion `json:"turn_detection,nullable"`
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
JSON struct {
Format respjson.Field
@@ -244,29 +246,118 @@ func (r *RealtimeSessionCreateResponseAudioInputNoiseReduction) UnmarshalJSON(da
return apijson.UnmarshalRoot(data, r)
}
-// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
-// set to `null` to turn off, in which case the client must manually trigger model
-// response. Server VAD means that the model will detect the start and end of
-// speech based on audio volume and respond at the end of user speech. Semantic VAD
-// is more advanced and uses a turn detection model (in conjunction with VAD) to
-// semantically estimate whether the user has finished speaking, then dynamically
-// sets a timeout based on this probability. For example, if user audio trails off
-// with "uhhm", the model will score a low probability of turn end and wait longer
-// for the user to continue speaking. This can be useful for more natural
-// conversations, but may have a higher latency.
-type RealtimeSessionCreateResponseAudioInputTurnDetection struct {
+// RealtimeSessionCreateResponseAudioInputTurnDetectionUnion contains all possible
+// properties and values from
+// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad],
+// [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad].
+//
+// Use the [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny] method
+// to switch on the variant.
+//
+// Use the methods beginning with 'As' to cast the union to one of its variants.
+type RealtimeSessionCreateResponseAudioInputTurnDetectionUnion struct {
+ // Any of "server_vad", "semantic_vad".
+ Type string `json:"type"`
+ CreateResponse bool `json:"create_response"`
+ // This field is from variant
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
+ IdleTimeoutMs int64 `json:"idle_timeout_ms"`
+ InterruptResponse bool `json:"interrupt_response"`
+ // This field is from variant
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
+ PrefixPaddingMs int64 `json:"prefix_padding_ms"`
+ // This field is from variant
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
+ SilenceDurationMs int64 `json:"silence_duration_ms"`
+ // This field is from variant
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
+ Threshold float64 `json:"threshold"`
+ // This field is from variant
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad].
+ Eagerness string `json:"eagerness"`
+ JSON struct {
+ Type respjson.Field
+ CreateResponse respjson.Field
+ IdleTimeoutMs respjson.Field
+ InterruptResponse respjson.Field
+ PrefixPaddingMs respjson.Field
+ SilenceDurationMs respjson.Field
+ Threshold respjson.Field
+ Eagerness respjson.Field
+ raw string
+ } `json:"-"`
+}
+
+// anyRealtimeSessionCreateResponseAudioInputTurnDetection is implemented by each
+// variant of [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion] to add
+// type safety for the return type of
+// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny]
+type anyRealtimeSessionCreateResponseAudioInputTurnDetection interface {
+ implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion()
+}
+
+func (RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() {
+}
+func (RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() {
+}
+
+// Use the following switch statement to find the correct variant
+//
+// switch variant := RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny().(type) {
+// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad:
+// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad:
+// default:
+// fmt.Errorf("no variant present")
+// }
+func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsAny() anyRealtimeSessionCreateResponseAudioInputTurnDetection {
+ switch u.Type {
+ case "server_vad":
+ return u.AsServerVad()
+ case "semantic_vad":
+ return u.AsSemanticVad()
+ }
+ return nil
+}
+
+func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsServerVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) {
+ apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
+ return
+}
+
+func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsSemanticVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) {
+ apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
+ return
+}
+
+// Returns the unmodified JSON received from the API
+func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) RawJSON() string {
+ return u.JSON.raw
+}
+
+func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) UnmarshalJSON(data []byte) error {
+ return apijson.UnmarshalRoot(data, r)
+}
+
+// Server-side voice activity detection (VAD) which flips on when user speech is
+// detected and off after a period of silence.
+type RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad struct {
+ // Type of turn detection, `server_vad` to turn on simple Server VAD.
+ Type constant.ServerVad `json:"type,required"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
CreateResponse bool `json:"create_response"`
- // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- // will wait longer for the user to continue speaking, `high` will respond more
- // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
- // and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ // Optional timeout after which a model response will be triggered automatically.
+ // This is useful for situations in which a long pause from the user is unexpected,
+ // such as a phone call. The model will effectively prompt the user to continue the
+ // conversation based on the current context.
//
- // Any of "low", "medium", "high", "auto".
- Eagerness string `json:"eagerness"`
- // Optional idle timeout after which turn detection will auto-timeout when no
- // additional audio is received and emits a `timeout_triggered` event.
+ // The timeout value will be applied after the last model response's audio has
+ // finished playing, i.e. it's set to the `response.done` time plus audio playback
+ // duration.
+ //
+ // An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ // Response) will be emitted when the timeout is reached. Idle timeout is currently
+ // only supported for `server_vad` mode.
IdleTimeoutMs int64 `json:"idle_timeout_ms,nullable"`
// Whether or not to automatically interrupt any ongoing response with output to
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
@@ -283,28 +374,63 @@ type RealtimeSessionCreateResponseAudioInputTurnDetection struct {
// defaults to 0.5. A higher threshold will require louder audio to activate the
// model, and thus might perform better in noisy environments.
Threshold float64 `json:"threshold"`
- // Type of turn detection.
- //
- // Any of "server_vad", "semantic_vad".
- Type string `json:"type"`
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
JSON struct {
+ Type respjson.Field
CreateResponse respjson.Field
- Eagerness respjson.Field
IdleTimeoutMs respjson.Field
InterruptResponse respjson.Field
PrefixPaddingMs respjson.Field
SilenceDurationMs respjson.Field
Threshold respjson.Field
- Type respjson.Field
ExtraFields map[string]respjson.Field
raw string
} `json:"-"`
}
// Returns the unmodified JSON received from the API
-func (r RealtimeSessionCreateResponseAudioInputTurnDetection) RawJSON() string { return r.JSON.raw }
-func (r *RealtimeSessionCreateResponseAudioInputTurnDetection) UnmarshalJSON(data []byte) error {
+func (r RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) RawJSON() string {
+ return r.JSON.raw
+}
+func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) UnmarshalJSON(data []byte) error {
+ return apijson.UnmarshalRoot(data, r)
+}
+
+// Server-side semantic turn detection which uses a model to determine when the
+// user has finished speaking.
+type RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad struct {
+ // Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+ Type constant.SemanticVad `json:"type,required"`
+ // Whether or not to automatically generate a response when a VAD stop event
+ // occurs.
+ CreateResponse bool `json:"create_response"`
+ // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ // will wait longer for the user to continue speaking, `high` will respond more
+ // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ // and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ //
+ // Any of "low", "medium", "high", "auto".
+ Eagerness string `json:"eagerness"`
+ // Whether or not to automatically interrupt any ongoing response with output to
+ // the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ // occurs.
+ InterruptResponse bool `json:"interrupt_response"`
+ // JSON contains metadata for fields, check presence with [respjson.Field.Valid].
+ JSON struct {
+ Type respjson.Field
+ CreateResponse respjson.Field
+ Eagerness respjson.Field
+ InterruptResponse respjson.Field
+ ExtraFields map[string]respjson.Field
+ raw string
+ } `json:"-"`
+}
+
+// Returns the unmodified JSON received from the API
+func (r RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) RawJSON() string {
+ return r.JSON.raw
+}
+func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
@@ -1152,7 +1278,8 @@ type ClientSecretNewResponseSessionUnionAudioInput struct {
NoiseReduction ClientSecretNewResponseSessionUnionAudioInputNoiseReduction `json:"noise_reduction"`
// This field is from variant [RealtimeSessionCreateResponseAudioInput].
Transcription AudioTranscription `json:"transcription"`
- // This field is a union of [RealtimeSessionCreateResponseAudioInputTurnDetection],
+ // This field is a union of
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion],
// [RealtimeTranscriptionSessionTurnDetection]
TurnDetection ClientSecretNewResponseSessionUnionAudioInputTurnDetection `json:"turn_detection"`
JSON struct {
@@ -1197,31 +1324,27 @@ func (r *ClientSecretNewResponseSessionUnionAudioInputNoiseReduction) UnmarshalJ
// For type safety it is recommended to directly use a variant of the
// [ClientSecretNewResponseSessionUnion].
type ClientSecretNewResponseSessionUnionAudioInputTurnDetection struct {
+ Type string `json:"type"`
+ CreateResponse bool `json:"create_response"`
// This field is from variant
- // [RealtimeSessionCreateResponseAudioInputTurnDetection].
- CreateResponse bool `json:"create_response"`
- // This field is from variant
- // [RealtimeSessionCreateResponseAudioInputTurnDetection].
- Eagerness string `json:"eagerness"`
- // This field is from variant
- // [RealtimeSessionCreateResponseAudioInputTurnDetection].
- IdleTimeoutMs int64 `json:"idle_timeout_ms"`
- // This field is from variant
- // [RealtimeSessionCreateResponseAudioInputTurnDetection].
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion].
+ IdleTimeoutMs int64 `json:"idle_timeout_ms"`
InterruptResponse bool `json:"interrupt_response"`
PrefixPaddingMs int64 `json:"prefix_padding_ms"`
SilenceDurationMs int64 `json:"silence_duration_ms"`
Threshold float64 `json:"threshold"`
- Type string `json:"type"`
- JSON struct {
+ // This field is from variant
+ // [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion].
+ Eagerness string `json:"eagerness"`
+ JSON struct {
+ Type respjson.Field
CreateResponse respjson.Field
- Eagerness respjson.Field
IdleTimeoutMs respjson.Field
InterruptResponse respjson.Field
PrefixPaddingMs respjson.Field
SilenceDurationMs respjson.Field
Threshold respjson.Field
- Type respjson.Field
+ Eagerness respjson.Field
raw string
} `json:"-"`
}
@@ -1518,45 +1641,49 @@ func (u clientSecretNewParamsSessionUnionAudioInput) GetTranscription() *AudioTr
func (u clientSecretNewParamsSessionUnionAudioInput) GetTurnDetection() (res clientSecretNewParamsSessionUnionAudioInputTurnDetection) {
switch vt := u.any.(type) {
case *RealtimeAudioConfigInputParam:
- res.any = &vt.TurnDetection
+ res.any = vt.TurnDetection
case *RealtimeTranscriptionSessionAudioInputParam:
- res.any = &vt.TurnDetection
+ res.any = vt.TurnDetection
}
return res
}
-// Can have the runtime types [*RealtimeAudioInputTurnDetectionParam],
-// [*RealtimeTranscriptionSessionAudioInputTurnDetectionParam]
+// Can have the runtime types [*RealtimeAudioInputTurnDetectionServerVadParam],
+// [*RealtimeAudioInputTurnDetectionSemanticVadParam],
+// [*RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam],
+// [*RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]
type clientSecretNewParamsSessionUnionAudioInputTurnDetection struct{ any }
// Use the following switch statement to get the type of the union:
//
// switch u.AsAny().(type) {
-// case *realtime.RealtimeAudioInputTurnDetectionParam:
-// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
+// case *realtime.RealtimeAudioInputTurnDetectionServerVadParam:
+// case *realtime.RealtimeAudioInputTurnDetectionSemanticVadParam:
+// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam:
+// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam:
// default:
// fmt.Errorf("not present")
// }
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) AsAny() any { return u.any }
// Returns a pointer to the underlying variant's property, if present.
-func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool {
+func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.CreateResponse)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.CreateResponse)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetType()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetType()
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
-func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string {
+func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return (*string)(&vt.Eagerness)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return (*string)(&vt.Eagerness)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetCreateResponse()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetCreateResponse()
}
return nil
}
@@ -1564,10 +1691,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness()
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeoutMs() *int64 {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.IdleTimeoutMs)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.IdleTimeoutMs)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetIdleTimeoutMs()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetIdleTimeoutMs()
}
return nil
}
@@ -1575,10 +1702,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeout
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptResponse() *bool {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.InterruptResponse)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.InterruptResponse)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetInterruptResponse()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetInterruptResponse()
}
return nil
}
@@ -1586,10 +1713,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptRe
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddingMs() *int64 {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.PrefixPaddingMs)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.PrefixPaddingMs)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetPrefixPaddingMs()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetPrefixPaddingMs()
}
return nil
}
@@ -1597,10 +1724,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddi
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDurationMs() *int64 {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.SilenceDurationMs)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.SilenceDurationMs)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetSilenceDurationMs()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetSilenceDurationMs()
}
return nil
}
@@ -1608,21 +1735,21 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDura
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetThreshold() *float64 {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.Threshold)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return paramutil.AddrIfPresent(vt.Threshold)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetThreshold()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetThreshold()
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
-func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string {
+func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string {
switch vt := u.any.(type) {
- case *RealtimeAudioInputTurnDetectionParam:
- return (*string)(&vt.Type)
- case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
- return (*string)(&vt.Type)
+ case *RealtimeAudioInputTurnDetectionUnionParam:
+ return vt.GetEagerness()
+ case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
+ return vt.GetEagerness()
}
return nil
}
diff --git a/realtime/clientsecret_test.go b/realtime/clientsecret_test.go
index 37ceb80..280d3c4 100644
--- a/realtime/clientsecret_test.go
+++ b/realtime/clientsecret_test.go
@@ -51,15 +51,15 @@ func TestClientSecretNewWithOptionalParams(t *testing.T) {
Model: realtime.AudioTranscriptionModelWhisper1,
Prompt: openai.String("prompt"),
},
- TurnDetection: realtime.RealtimeAudioInputTurnDetectionParam{
- CreateResponse: openai.Bool(true),
- Eagerness: realtime.RealtimeAudioInputTurnDetectionEagernessLow,
- IdleTimeoutMs: openai.Int(0),
- InterruptResponse: openai.Bool(true),
- PrefixPaddingMs: openai.Int(0),
- SilenceDurationMs: openai.Int(0),
- Threshold: openai.Float(0),
- Type: realtime.RealtimeAudioInputTurnDetectionTypeServerVad,
+ TurnDetection: realtime.RealtimeAudioInputTurnDetectionUnionParam{
+ OfServerVad: &realtime.RealtimeAudioInputTurnDetectionServerVadParam{
+ CreateResponse: openai.Bool(true),
+ IdleTimeoutMs: openai.Int(5000),
+ InterruptResponse: openai.Bool(true),
+ PrefixPaddingMs: openai.Int(0),
+ SilenceDurationMs: openai.Int(0),
+ Threshold: openai.Float(0),
+ },
},
},
Output: realtime.RealtimeAudioConfigOutputParam{
diff --git a/realtime/realtime.go b/realtime/realtime.go
index f440e8d..a38db3c 100644
--- a/realtime/realtime.go
+++ b/realtime/realtime.go
@@ -141,6 +141,20 @@ func (r *RealtimeAudioConfigParam) UnmarshalJSON(data []byte) error {
}
type RealtimeAudioConfigInputParam struct {
+	// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ // set to `null` to turn off, in which case the client must manually trigger model
+ // response.
+ //
+ // Server VAD means that the model will detect the start and end of speech based on
+ // audio volume and respond at the end of user speech.
+ //
+ // Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ // with VAD) to semantically estimate whether the user has finished speaking, then
+ // dynamically sets a timeout based on this probability. For example, if user audio
+ // trails off with "uhhm", the model will score a low probability of turn end and
+ // wait longer for the user to continue speaking. This can be useful for more
+ // natural conversations, but may have a higher latency.
+ TurnDetection RealtimeAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"`
// The format of the input audio.
Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"`
// Configuration for input audio noise reduction. This can be set to `null` to turn
@@ -158,17 +172,6 @@ type RealtimeAudioConfigInputParam struct {
// what the model heard. The client can optionally set the language and prompt for
// transcription, these offer additional guidance to the transcription service.
Transcription AudioTranscriptionParam `json:"transcription,omitzero"`
- // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- // set to `null` to turn off, in which case the client must manually trigger model
- // response. Server VAD means that the model will detect the start and end of
- // speech based on audio volume and respond at the end of user speech. Semantic VAD
- // is more advanced and uses a turn detection model (in conjunction with VAD) to
- // semantically estimate whether the user has finished speaking, then dynamically
- // sets a timeout based on this probability. For example, if user audio trails off
- // with "uhhm", the model will score a low probability of turn end and wait longer
- // for the user to continue speaking. This can be useful for more natural
- // conversations, but may have a higher latency.
- TurnDetection RealtimeAudioInputTurnDetectionParam `json:"turn_detection,omitzero"`
paramObj
}
@@ -530,19 +533,126 @@ func init() {
)
}
-// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
-// set to `null` to turn off, in which case the client must manually trigger model
-// response. Server VAD means that the model will detect the start and end of
-// speech based on audio volume and respond at the end of user speech. Semantic VAD
-// is more advanced and uses a turn detection model (in conjunction with VAD) to
-// semantically estimate whether the user has finished speaking, then dynamically
-// sets a timeout based on this probability. For example, if user audio trails off
-// with "uhhm", the model will score a low probability of turn end and wait longer
-// for the user to continue speaking. This can be useful for more natural
-// conversations, but may have a higher latency.
-type RealtimeAudioInputTurnDetectionParam struct {
- // Optional idle timeout after which turn detection will auto-timeout when no
- // additional audio is received and emits a `timeout_triggered` event.
+// Only one field can be non-zero.
+//
+// Use [param.IsOmitted] to confirm if a field is set.
+type RealtimeAudioInputTurnDetectionUnionParam struct {
+ OfServerVad *RealtimeAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"`
+ OfSemanticVad *RealtimeAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"`
+ paramUnion
+}
+
+func (u RealtimeAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) {
+ return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad)
+}
+func (u *RealtimeAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error {
+ return apijson.UnmarshalRoot(data, u)
+}
+
+func (u *RealtimeAudioInputTurnDetectionUnionParam) asAny() any {
+ if !param.IsOmitted(u.OfServerVad) {
+ return u.OfServerVad
+ } else if !param.IsOmitted(u.OfSemanticVad) {
+ return u.OfSemanticVad
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 {
+ if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() {
+ return &vt.IdleTimeoutMs.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 {
+ if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() {
+ return &vt.PrefixPaddingMs.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 {
+ if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() {
+ return &vt.SilenceDurationMs.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetThreshold() *float64 {
+ if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() {
+ return &vt.Threshold.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetEagerness() *string {
+ if vt := u.OfSemanticVad; vt != nil {
+ return &vt.Eagerness
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetType() *string {
+ if vt := u.OfServerVad; vt != nil {
+ return (*string)(&vt.Type)
+ } else if vt := u.OfSemanticVad; vt != nil {
+ return (*string)(&vt.Type)
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool {
+ if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() {
+ return &vt.CreateResponse.Value
+ } else if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() {
+ return &vt.CreateResponse.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool {
+ if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() {
+ return &vt.InterruptResponse.Value
+ } else if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() {
+ return &vt.InterruptResponse.Value
+ }
+ return nil
+}
+
+func init() {
+ apijson.RegisterUnion[RealtimeAudioInputTurnDetectionUnionParam](
+ "type",
+ apijson.Discriminator[RealtimeAudioInputTurnDetectionServerVadParam]("server_vad"),
+ apijson.Discriminator[RealtimeAudioInputTurnDetectionSemanticVadParam]("semantic_vad"),
+ )
+}
+
+// Server-side voice activity detection (VAD) which flips on when user speech is
+// detected and off after a period of silence.
+//
+// The property Type is required.
+type RealtimeAudioInputTurnDetectionServerVadParam struct {
+ // Optional timeout after which a model response will be triggered automatically.
+ // This is useful for situations in which a long pause from the user is unexpected,
+ // such as a phone call. The model will effectively prompt the user to continue the
+ // conversation based on the current context.
+ //
+ // The timeout value will be applied after the last model response's audio has
+ // finished playing, i.e. it's set to the `response.done` time plus audio playback
+ // duration.
+ //
+ // An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ // Response) will be emitted when the timeout is reached. Idle timeout is currently
+ // only supported for `server_vad` mode.
IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
@@ -562,48 +672,60 @@ type RealtimeAudioInputTurnDetectionParam struct {
// defaults to 0.5. A higher threshold will require louder audio to activate the
// model, and thus might perform better in noisy environments.
Threshold param.Opt[float64] `json:"threshold,omitzero"`
+ // Type of turn detection, `server_vad` to turn on simple Server VAD.
+ //
+ // This field can be elided, and will marshal its zero value as "server_vad".
+ Type constant.ServerVad `json:"type,required"`
+ paramObj
+}
+
+func (r RealtimeAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) {
+ type shadow RealtimeAudioInputTurnDetectionServerVadParam
+ return param.MarshalObject(r, (*shadow)(&r))
+}
+func (r *RealtimeAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error {
+ return apijson.UnmarshalRoot(data, r)
+}
+
+// Server-side semantic turn detection which uses a model to determine when the
+// user has finished speaking.
+//
+// The property Type is required.
+type RealtimeAudioInputTurnDetectionSemanticVadParam struct {
+ // Whether or not to automatically generate a response when a VAD stop event
+ // occurs.
+ CreateResponse param.Opt[bool] `json:"create_response,omitzero"`
+ // Whether or not to automatically interrupt any ongoing response with output to
+ // the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ // occurs.
+ InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"`
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
//
// Any of "low", "medium", "high", "auto".
- Eagerness RealtimeAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"`
- // Type of turn detection.
+ Eagerness string `json:"eagerness,omitzero"`
+ // Type of turn detection, `semantic_vad` to turn on Semantic VAD.
//
- // Any of "server_vad", "semantic_vad".
- Type RealtimeAudioInputTurnDetectionType `json:"type,omitzero"`
+ // This field can be elided, and will marshal its zero value as "semantic_vad".
+ Type constant.SemanticVad `json:"type,required"`
paramObj
}
-func (r RealtimeAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) {
- type shadow RealtimeAudioInputTurnDetectionParam
+func (r RealtimeAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) {
+ type shadow RealtimeAudioInputTurnDetectionSemanticVadParam
return param.MarshalObject(r, (*shadow)(&r))
}
-func (r *RealtimeAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error {
+func (r *RealtimeAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
-// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-// will wait longer for the user to continue speaking, `high` will respond more
-// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
-// and `high` have max timeouts of 8s, 4s, and 2s respectively.
-type RealtimeAudioInputTurnDetectionEagerness string
-
-const (
- RealtimeAudioInputTurnDetectionEagernessLow RealtimeAudioInputTurnDetectionEagerness = "low"
- RealtimeAudioInputTurnDetectionEagernessMedium RealtimeAudioInputTurnDetectionEagerness = "medium"
- RealtimeAudioInputTurnDetectionEagernessHigh RealtimeAudioInputTurnDetectionEagerness = "high"
- RealtimeAudioInputTurnDetectionEagernessAuto RealtimeAudioInputTurnDetectionEagerness = "auto"
-)
-
-// Type of turn detection.
-type RealtimeAudioInputTurnDetectionType string
-
-const (
- RealtimeAudioInputTurnDetectionTypeServerVad RealtimeAudioInputTurnDetectionType = "server_vad"
- RealtimeAudioInputTurnDetectionTypeSemanticVad RealtimeAudioInputTurnDetectionType = "semantic_vad"
-)
+func init() {
+ apijson.RegisterFieldValidator[RealtimeAudioInputTurnDetectionSemanticVadParam](
+ "eagerness", "low", "medium", "high", "auto",
+ )
+}
type RealtimeFunctionTool struct {
// The description of the function, including guidance on when and how to call it,
@@ -1264,6 +1386,20 @@ func (r *RealtimeTranscriptionSessionAudioParam) UnmarshalJSON(data []byte) erro
}
type RealtimeTranscriptionSessionAudioInputParam struct {
+ // Configuration for turn detection, either Server VAD or Semantic VAD. This can be
+ // set to `null` to turn off, in which case the client must manually trigger model
+ // response.
+ //
+ // Server VAD means that the model will detect the start and end of speech based on
+ // audio volume and respond at the end of user speech.
+ //
+ // Semantic VAD is more advanced and uses a turn detection model (in conjunction
+ // with VAD) to semantically estimate whether the user has finished speaking, then
+ // dynamically sets a timeout based on this probability. For example, if user audio
+ // trails off with "uhhm", the model will score a low probability of turn end and
+ // wait longer for the user to continue speaking. This can be useful for more
+ // natural conversations, but may have a higher latency.
+ TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"`
// The PCM audio format. Only a 24kHz sample rate is supported.
Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"`
// Configuration for input audio noise reduction. This can be set to `null` to turn
@@ -1281,17 +1417,6 @@ type RealtimeTranscriptionSessionAudioInputParam struct {
// what the model heard. The client can optionally set the language and prompt for
// transcription, these offer additional guidance to the transcription service.
Transcription AudioTranscriptionParam `json:"transcription,omitzero"`
- // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- // set to `null` to turn off, in which case the client must manually trigger model
- // response. Server VAD means that the model will detect the start and end of
- // speech based on audio volume and respond at the end of user speech. Semantic VAD
- // is more advanced and uses a turn detection model (in conjunction with VAD) to
- // semantically estimate whether the user has finished speaking, then dynamically
- // sets a timeout based on this probability. For example, if user audio trails off
- // with "uhhm", the model will score a low probability of turn end and wait longer
- // for the user to continue speaking. This can be useful for more natural
- // conversations, but may have a higher latency.
- TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionParam `json:"turn_detection,omitzero"`
paramObj
}
@@ -1326,19 +1451,126 @@ func (r *RealtimeTranscriptionSessionAudioInputNoiseReductionParam) UnmarshalJSO
return apijson.UnmarshalRoot(data, r)
}
-// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
-// set to `null` to turn off, in which case the client must manually trigger model
-// response. Server VAD means that the model will detect the start and end of
-// speech based on audio volume and respond at the end of user speech. Semantic VAD
-// is more advanced and uses a turn detection model (in conjunction with VAD) to
-// semantically estimate whether the user has finished speaking, then dynamically
-// sets a timeout based on this probability. For example, if user audio trails off
-// with "uhhm", the model will score a low probability of turn end and wait longer
-// for the user to continue speaking. This can be useful for more natural
-// conversations, but may have a higher latency.
-type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct {
- // Optional idle timeout after which turn detection will auto-timeout when no
- // additional audio is received.
+// Only one field can be non-zero.
+//
+// Use [param.IsOmitted] to confirm if a field is set.
+type RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam struct {
+ OfServerVad *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"`
+ OfSemanticVad *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"`
+ paramUnion
+}
+
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) {
+ return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad)
+}
+func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error {
+ return apijson.UnmarshalRoot(data, u)
+}
+
+func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) asAny() any {
+ if !param.IsOmitted(u.OfServerVad) {
+ return u.OfServerVad
+ } else if !param.IsOmitted(u.OfSemanticVad) {
+ return u.OfSemanticVad
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 {
+ if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() {
+ return &vt.IdleTimeoutMs.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 {
+ if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() {
+ return &vt.PrefixPaddingMs.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 {
+ if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() {
+ return &vt.SilenceDurationMs.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetThreshold() *float64 {
+ if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() {
+ return &vt.Threshold.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetEagerness() *string {
+ if vt := u.OfSemanticVad; vt != nil {
+ return &vt.Eagerness
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetType() *string {
+ if vt := u.OfServerVad; vt != nil {
+ return (*string)(&vt.Type)
+ } else if vt := u.OfSemanticVad; vt != nil {
+ return (*string)(&vt.Type)
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool {
+ if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() {
+ return &vt.CreateResponse.Value
+ } else if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() {
+ return &vt.CreateResponse.Value
+ }
+ return nil
+}
+
+// Returns a pointer to the underlying variant's property, if present.
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool {
+ if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() {
+ return &vt.InterruptResponse.Value
+ } else if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() {
+ return &vt.InterruptResponse.Value
+ }
+ return nil
+}
+
+func init() {
+ apijson.RegisterUnion[RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam](
+ "type",
+ apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam]("server_vad"),
+ apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]("semantic_vad"),
+ )
+}
+
+// Server-side voice activity detection (VAD) which flips on when user speech is
+// detected and off after a period of silence.
+//
+// The property Type is required.
+type RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam struct {
+ // Optional timeout after which a model response will be triggered automatically.
+ // This is useful for situations in which a long pause from the user is unexpected,
+ // such as a phone call. The model will effectively prompt the user to continue the
+ // conversation based on the current context.
+ //
+ // The timeout value will be applied after the last model response's audio has
+ // finished playing, i.e. it's set to the `response.done` time plus audio playback
+ // duration.
+ //
+ // An `input_audio_buffer.timeout_triggered` event (plus events associated with the
+ // Response) will be emitted when the timeout is reached. Idle timeout is currently
+ // only supported for `server_vad` mode.
IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
@@ -1358,46 +1590,60 @@ type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct {
// defaults to 0.5. A higher threshold will require louder audio to activate the
// model, and thus might perform better in noisy environments.
Threshold param.Opt[float64] `json:"threshold,omitzero"`
- // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
- // will wait longer for the user to continue speaking, `high` will respond more
- // quickly. `auto` is the default and is equivalent to `medium`.
+ // Type of turn detection, `server_vad` to turn on simple Server VAD.
//
- // Any of "low", "medium", "high", "auto".
- Eagerness RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"`
- // Type of turn detection.
- //
- // Any of "server_vad", "semantic_vad".
- Type RealtimeTranscriptionSessionAudioInputTurnDetectionType `json:"type,omitzero"`
+ // This field can be elided, and will marshal its zero value as "server_vad".
+ Type constant.ServerVad `json:"type,required"`
paramObj
}
-func (r RealtimeTranscriptionSessionAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) {
- type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionParam
+func (r RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) {
+ type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam
return param.MarshalObject(r, (*shadow)(&r))
}
-func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error {
+func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
-// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
-// will wait longer for the user to continue speaking, `high` will respond more
-// quickly. `auto` is the default and is equivalent to `medium`.
-type RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness string
+// Server-side semantic turn detection which uses a model to determine when the
+// user has finished speaking.
+//
+// The property Type is required.
+type RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam struct {
+ // Whether or not to automatically generate a response when a VAD stop event
+ // occurs.
+ CreateResponse param.Opt[bool] `json:"create_response,omitzero"`
+ // Whether or not to automatically interrupt any ongoing response with output to
+ // the default conversation (i.e. `conversation` of `auto`) when a VAD start event
+ // occurs.
+ InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"`
+ // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
+ // will wait longer for the user to continue speaking, `high` will respond more
+ // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
+ // and `high` have max timeouts of 8s, 4s, and 2s respectively.
+ //
+ // Any of "low", "medium", "high", "auto".
+ Eagerness string `json:"eagerness,omitzero"`
+ // Type of turn detection, `semantic_vad` to turn on Semantic VAD.
+ //
+ // This field can be elided, and will marshal its zero value as "semantic_vad".
+ Type constant.SemanticVad `json:"type,required"`
+ paramObj
+}
-const (
- RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessLow RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "low"
- RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessMedium RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "medium"
- RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessHigh RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "high"
- RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessAuto RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "auto"
-)
+func (r RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) {
+ type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam
+ return param.MarshalObject(r, (*shadow)(&r))
+}
+func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error {
+ return apijson.UnmarshalRoot(data, r)
+}
-// Type of turn detection.
-type RealtimeTranscriptionSessionAudioInputTurnDetectionType string
-
-const (
- RealtimeTranscriptionSessionAudioInputTurnDetectionTypeServerVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "server_vad"
- RealtimeTranscriptionSessionAudioInputTurnDetectionTypeSemanticVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "semantic_vad"
-)
+func init() {
+ apijson.RegisterFieldValidator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam](
+ "eagerness", "low", "medium", "high", "auto",
+ )
+}
// Realtime transcription session object configuration.
//
diff --git a/responses/response.go b/responses/response.go
index 2eb655e..e3b20d5 100644
--- a/responses/response.go
+++ b/responses/response.go
@@ -879,10 +879,10 @@ type Response struct {
TopLogprobs int64 `json:"top_logprobs,nullable"`
// The truncation strategy to use for the model response.
//
- // - `auto`: If the context of this response and previous ones exceeds the model's
- // context window size, the model will truncate the response to fit the context
- // window by dropping input items in the middle of the conversation.
- // - `disabled` (default): If a model response will exceed the context window size
+ // - `auto`: If the input to this Response exceeds the model's context window size,
+ // the model will truncate the response to fit the context window by dropping
+ // items from the beginning of the conversation.
+ // - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
//
// Any of "auto", "disabled".
@@ -1125,10 +1125,10 @@ const (
// The truncation strategy to use for the model response.
//
-// - `auto`: If the context of this response and previous ones exceeds the model's
-// context window size, the model will truncate the response to fit the context
-// window by dropping input items in the middle of the conversation.
-// - `disabled` (default): If a model response will exceed the context window size
+// - `auto`: If the input to this Response exceeds the model's context window size,
+// the model will truncate the response to fit the context window by dropping
+// items from the beginning of the conversation.
+// - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
type ResponseTruncation string
@@ -14285,10 +14285,10 @@ type ResponseNewParams struct {
StreamOptions ResponseNewParamsStreamOptions `json:"stream_options,omitzero"`
// The truncation strategy to use for the model response.
//
- // - `auto`: If the context of this response and previous ones exceeds the model's
- // context window size, the model will truncate the response to fit the context
- // window by dropping input items in the middle of the conversation.
- // - `disabled` (default): If a model response will exceed the context window size
+ // - `auto`: If the input to this Response exceeds the model's context window size,
+ // the model will truncate the response to fit the context window by dropping
+ // items from the beginning of the conversation.
+ // - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
//
// Any of "auto", "disabled".
@@ -14548,10 +14548,10 @@ func (u ResponseNewParamsToolChoiceUnion) GetName() *string {
// The truncation strategy to use for the model response.
//
-// - `auto`: If the context of this response and previous ones exceeds the model's
-// context window size, the model will truncate the response to fit the context
-// window by dropping input items in the middle of the conversation.
-// - `disabled` (default): If a model response will exceed the context window size
+// - `auto`: If the input to this Response exceeds the model's context window size,
+// the model will truncate the response to fit the context window by dropping
+// items from the beginning of the conversation.
+// - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
type ResponseNewParamsTruncation string
diff --git a/shared/constant/constants.go b/shared/constant/constants.go
index c2b547a..de5e32a 100644
--- a/shared/constant/constants.go
+++ b/shared/constant/constants.go
@@ -217,6 +217,8 @@ type ScoreModel string // Always "score_mo
type Screenshot string // Always "screenshot"
type Scroll string // Always "scroll"
type Search string // Always "search"
+type SemanticVad string // Always "semantic_vad"
+type ServerVad string // Always "server_vad"
type SessionCreated string // Always "session.created"
type SessionUpdate string // Always "session.update"
type SessionUpdated string // Always "session.updated"
@@ -610,6 +612,8 @@ func (c ScoreModel) Default() ScoreModel { return "score
func (c Screenshot) Default() Screenshot { return "screenshot" }
func (c Scroll) Default() Scroll { return "scroll" }
func (c Search) Default() Search { return "search" }
+func (c SemanticVad) Default() SemanticVad { return "semantic_vad" }
+func (c ServerVad) Default() ServerVad { return "server_vad" }
func (c SessionCreated) Default() SessionCreated { return "session.created" }
func (c SessionUpdate) Default() SessionUpdate { return "session.update" }
func (c SessionUpdated) Default() SessionUpdated { return "session.updated" }
@@ -903,6 +907,8 @@ func (c ScoreModel) MarshalJSON() ([]byte, error) { retu
func (c Screenshot) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c Scroll) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c Search) MarshalJSON() ([]byte, error) { return marshalString(c) }
+func (c SemanticVad) MarshalJSON() ([]byte, error) { return marshalString(c) }
+func (c ServerVad) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SessionCreated) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SessionUpdate) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SessionUpdated) MarshalJSON() ([]byte, error) { return marshalString(c) }