From d92ea4850f3720ba7a372f7bc9f8ecff07392ba0 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:03:52 +0000 Subject: [PATCH] chore(api): Minor docs and type updates for realtime --- .stats.yml | 4 +- api.md | 4 +- realtime/clientsecret.go | 313 ++++++++++++++++------- realtime/clientsecret_test.go | 18 +- realtime/realtime.go | 454 ++++++++++++++++++++++++++-------- responses/response.go | 32 +-- shared/constant/constants.go | 6 + 7 files changed, 605 insertions(+), 226 deletions(-) diff --git a/.stats.yml b/.stats.yml index 03c3e4c..1561159 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 106 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml -openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml +openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649 config_hash: 930dac3aa861344867e4ac84f037b5df diff --git a/api.md b/api.md index 6fad482..74d733d 100644 --- a/api.md +++ b/api.md @@ -793,7 +793,7 @@ Params Types: - realtime.RealtimeAudioConfigInputParam - realtime.RealtimeAudioConfigOutputParam - realtime.RealtimeAudioFormatsUnionParam -- realtime.RealtimeAudioInputTurnDetectionParam +- realtime.RealtimeAudioInputTurnDetectionUnionParam - realtime.RealtimeFunctionToolParam - realtime.RealtimeSessionCreateRequestParam - realtime.RealtimeToolChoiceConfigUnionParam @@ -802,7 +802,7 @@ Params Types: - realtime.RealtimeTracingConfigUnionParam - realtime.RealtimeTranscriptionSessionAudioParam - realtime.RealtimeTranscriptionSessionAudioInputParam -- realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionParam +- realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam - 
realtime.RealtimeTranscriptionSessionCreateRequestParam - realtime.RealtimeTruncationUnionParam - realtime.RealtimeTruncationRetentionRatioParam diff --git a/realtime/clientsecret.go b/realtime/clientsecret.go index ba1dc14..cb9e36b 100644 --- a/realtime/clientsecret.go +++ b/realtime/clientsecret.go @@ -8,7 +8,6 @@ import ( "net/http" "github.com/openai/openai-go/v2/internal/apijson" - "github.com/openai/openai-go/v2/internal/paramutil" "github.com/openai/openai-go/v2/internal/requestconfig" "github.com/openai/openai-go/v2/option" "github.com/openai/openai-go/v2/packages/param" @@ -192,15 +191,18 @@ type RealtimeSessionCreateResponseAudioInput struct { Transcription AudioTranscription `json:"transcription"` // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be // set to `null` to turn off, in which case the client must manually trigger model - // response. Server VAD means that the model will detect the start and end of - // speech based on audio volume and respond at the end of user speech. Semantic VAD - // is more advanced and uses a turn detection model (in conjunction with VAD) to - // semantically estimate whether the user has finished speaking, then dynamically - // sets a timeout based on this probability. For example, if user audio trails off - // with "uhhm", the model will score a low probability of turn end and wait longer - // for the user to continue speaking. This can be useful for more natural - // conversations, but may have a higher latency. - TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetection `json:"turn_detection"` + // response. + // + // Server VAD means that the model will detect the start and end of speech based on + // audio volume and respond at the end of user speech. 
+ // + // Semantic VAD is more advanced and uses a turn detection model (in conjunction + // with VAD) to semantically estimate whether the user has finished speaking, then + // dynamically sets a timeout based on this probability. For example, if user audio + // trails off with "uhhm", the model will score a low probability of turn end and + // wait longer for the user to continue speaking. This can be useful for more + // natural conversations, but may have a higher latency. + TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetectionUnion `json:"turn_detection,nullable"` // JSON contains metadata for fields, check presence with [respjson.Field.Valid]. JSON struct { Format respjson.Field @@ -244,29 +246,118 @@ func (r *RealtimeSessionCreateResponseAudioInputNoiseReduction) UnmarshalJSON(da return apijson.UnmarshalRoot(data, r) } -// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be -// set to `null` to turn off, in which case the client must manually trigger model -// response. Server VAD means that the model will detect the start and end of -// speech based on audio volume and respond at the end of user speech. Semantic VAD -// is more advanced and uses a turn detection model (in conjunction with VAD) to -// semantically estimate whether the user has finished speaking, then dynamically -// sets a timeout based on this probability. For example, if user audio trails off -// with "uhhm", the model will score a low probability of turn end and wait longer -// for the user to continue speaking. This can be useful for more natural -// conversations, but may have a higher latency. -type RealtimeSessionCreateResponseAudioInputTurnDetection struct { +// RealtimeSessionCreateResponseAudioInputTurnDetectionUnion contains all possible +// properties and values from +// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad], +// [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad]. 
+// +// Use the [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny] method +// to switch on the variant. +// +// Use the methods beginning with 'As' to cast the union to one of its variants. +type RealtimeSessionCreateResponseAudioInputTurnDetectionUnion struct { + // Any of "server_vad", "semantic_vad". + Type string `json:"type"` + CreateResponse bool `json:"create_response"` + // This field is from variant + // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad]. + IdleTimeoutMs int64 `json:"idle_timeout_ms"` + InterruptResponse bool `json:"interrupt_response"` + // This field is from variant + // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad]. + PrefixPaddingMs int64 `json:"prefix_padding_ms"` + // This field is from variant + // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad]. + SilenceDurationMs int64 `json:"silence_duration_ms"` + // This field is from variant + // [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad]. + Threshold float64 `json:"threshold"` + // This field is from variant + // [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad]. 
+ Eagerness string `json:"eagerness"` + JSON struct { + Type respjson.Field + CreateResponse respjson.Field + IdleTimeoutMs respjson.Field + InterruptResponse respjson.Field + PrefixPaddingMs respjson.Field + SilenceDurationMs respjson.Field + Threshold respjson.Field + Eagerness respjson.Field + raw string + } `json:"-"` +} + +// anyRealtimeSessionCreateResponseAudioInputTurnDetection is implemented by each +// variant of [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion] to add +// type safety for the return type of +// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny] +type anyRealtimeSessionCreateResponseAudioInputTurnDetection interface { + implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() +} + +func (RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() { +} +func (RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() { +} + +// Use the following switch statement to find the correct variant +// +// switch variant := RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny().(type) { +// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad: +// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad: +// default: +// fmt.Errorf("no variant present") +// } +func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsAny() anyRealtimeSessionCreateResponseAudioInputTurnDetection { + switch u.Type { + case "server_vad": + return u.AsServerVad() + case "semantic_vad": + return u.AsSemanticVad() + } + return nil +} + +func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsServerVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) { + apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v) + return +} + +func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsSemanticVad() (v 
RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) { + apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v) + return +} + +// Returns the unmodified JSON received from the API +func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) RawJSON() string { + return u.JSON.raw +} + +func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) UnmarshalJSON(data []byte) error { + return apijson.UnmarshalRoot(data, r) +} + +// Server-side voice activity detection (VAD) which flips on when user speech is +// detected and off after a period of silence. +type RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad struct { + // Type of turn detection, `server_vad` to turn on simple Server VAD. + Type constant.ServerVad `json:"type,required"` // Whether or not to automatically generate a response when a VAD stop event // occurs. CreateResponse bool `json:"create_response"` - // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - // will wait longer for the user to continue speaking, `high` will respond more - // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, - // and `high` have max timeouts of 8s, 4s, and 2s respectively. + // Optional timeout after which a model response will be triggered automatically. + // This is useful for situations in which a long pause from the user is unexpected, + // such as a phone call. The model will effectively prompt the user to continue the + // conversation based on the current context. // - // Any of "low", "medium", "high", "auto". - Eagerness string `json:"eagerness"` - // Optional idle timeout after which turn detection will auto-timeout when no - // additional audio is received and emits a `timeout_triggered` event. + // The timeout value will be applied after the last model response's audio has + // finished playing, i.e. it's set to the `response.done` time plus audio playback + // duration. 
+ // + // An `input_audio_buffer.timeout_triggered` event (plus events associated with the + // Response) will be emitted when the timeout is reached. Idle timeout is currently + // only supported for `server_vad` mode. IdleTimeoutMs int64 `json:"idle_timeout_ms,nullable"` // Whether or not to automatically interrupt any ongoing response with output to // the default conversation (i.e. `conversation` of `auto`) when a VAD start event @@ -283,28 +374,63 @@ type RealtimeSessionCreateResponseAudioInputTurnDetection struct { // defaults to 0.5. A higher threshold will require louder audio to activate the // model, and thus might perform better in noisy environments. Threshold float64 `json:"threshold"` - // Type of turn detection. - // - // Any of "server_vad", "semantic_vad". - Type string `json:"type"` // JSON contains metadata for fields, check presence with [respjson.Field.Valid]. JSON struct { + Type respjson.Field CreateResponse respjson.Field - Eagerness respjson.Field IdleTimeoutMs respjson.Field InterruptResponse respjson.Field PrefixPaddingMs respjson.Field SilenceDurationMs respjson.Field Threshold respjson.Field - Type respjson.Field ExtraFields map[string]respjson.Field raw string } `json:"-"` } // Returns the unmodified JSON received from the API -func (r RealtimeSessionCreateResponseAudioInputTurnDetection) RawJSON() string { return r.JSON.raw } -func (r *RealtimeSessionCreateResponseAudioInputTurnDetection) UnmarshalJSON(data []byte) error { +func (r RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) RawJSON() string { + return r.JSON.raw +} +func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) UnmarshalJSON(data []byte) error { + return apijson.UnmarshalRoot(data, r) +} + +// Server-side semantic turn detection which uses a model to determine when the +// user has finished speaking. 
+type RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad struct { + // Type of turn detection, `semantic_vad` to turn on Semantic VAD. + Type constant.SemanticVad `json:"type,required"` + // Whether or not to automatically generate a response when a VAD stop event + // occurs. + CreateResponse bool `json:"create_response"` + // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + // will wait longer for the user to continue speaking, `high` will respond more + // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + // and `high` have max timeouts of 8s, 4s, and 2s respectively. + // + // Any of "low", "medium", "high", "auto". + Eagerness string `json:"eagerness"` + // Whether or not to automatically interrupt any ongoing response with output to + // the default conversation (i.e. `conversation` of `auto`) when a VAD start event + // occurs. + InterruptResponse bool `json:"interrupt_response"` + // JSON contains metadata for fields, check presence with [respjson.Field.Valid]. + JSON struct { + Type respjson.Field + CreateResponse respjson.Field + Eagerness respjson.Field + InterruptResponse respjson.Field + ExtraFields map[string]respjson.Field + raw string + } `json:"-"` +} + +// Returns the unmodified JSON received from the API +func (r RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) RawJSON() string { + return r.JSON.raw +} +func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) UnmarshalJSON(data []byte) error { return apijson.UnmarshalRoot(data, r) } @@ -1152,7 +1278,8 @@ type ClientSecretNewResponseSessionUnionAudioInput struct { NoiseReduction ClientSecretNewResponseSessionUnionAudioInputNoiseReduction `json:"noise_reduction"` // This field is from variant [RealtimeSessionCreateResponseAudioInput]. 
Transcription AudioTranscription `json:"transcription"` - // This field is a union of [RealtimeSessionCreateResponseAudioInputTurnDetection], + // This field is a union of + // [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion], // [RealtimeTranscriptionSessionTurnDetection] TurnDetection ClientSecretNewResponseSessionUnionAudioInputTurnDetection `json:"turn_detection"` JSON struct { @@ -1197,31 +1324,27 @@ func (r *ClientSecretNewResponseSessionUnionAudioInputNoiseReduction) UnmarshalJ // For type safety it is recommended to directly use a variant of the // [ClientSecretNewResponseSessionUnion]. type ClientSecretNewResponseSessionUnionAudioInputTurnDetection struct { + Type string `json:"type"` + CreateResponse bool `json:"create_response"` // This field is from variant - // [RealtimeSessionCreateResponseAudioInputTurnDetection]. - CreateResponse bool `json:"create_response"` - // This field is from variant - // [RealtimeSessionCreateResponseAudioInputTurnDetection]. - Eagerness string `json:"eagerness"` - // This field is from variant - // [RealtimeSessionCreateResponseAudioInputTurnDetection]. - IdleTimeoutMs int64 `json:"idle_timeout_ms"` - // This field is from variant - // [RealtimeSessionCreateResponseAudioInputTurnDetection]. + // [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion]. + IdleTimeoutMs int64 `json:"idle_timeout_ms"` InterruptResponse bool `json:"interrupt_response"` PrefixPaddingMs int64 `json:"prefix_padding_ms"` SilenceDurationMs int64 `json:"silence_duration_ms"` Threshold float64 `json:"threshold"` - Type string `json:"type"` - JSON struct { + // This field is from variant + // [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion]. 
+ Eagerness string `json:"eagerness"` + JSON struct { + Type respjson.Field CreateResponse respjson.Field - Eagerness respjson.Field IdleTimeoutMs respjson.Field InterruptResponse respjson.Field PrefixPaddingMs respjson.Field SilenceDurationMs respjson.Field Threshold respjson.Field - Type respjson.Field + Eagerness respjson.Field raw string } `json:"-"` } @@ -1518,45 +1641,49 @@ func (u clientSecretNewParamsSessionUnionAudioInput) GetTranscription() *AudioTr func (u clientSecretNewParamsSessionUnionAudioInput) GetTurnDetection() (res clientSecretNewParamsSessionUnionAudioInputTurnDetection) { switch vt := u.any.(type) { case *RealtimeAudioConfigInputParam: - res.any = &vt.TurnDetection + res.any = vt.TurnDetection case *RealtimeTranscriptionSessionAudioInputParam: - res.any = &vt.TurnDetection + res.any = vt.TurnDetection } return res } -// Can have the runtime types [*RealtimeAudioInputTurnDetectionParam], -// [*RealtimeTranscriptionSessionAudioInputTurnDetectionParam] +// Can have the runtime types [*RealtimeAudioInputTurnDetectionServerVadParam], +// [*RealtimeAudioInputTurnDetectionSemanticVadParam], +// [*RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam], +// [*RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam] type clientSecretNewParamsSessionUnionAudioInputTurnDetection struct{ any } // Use the following switch statement to get the type of the union: // // switch u.AsAny().(type) { -// case *realtime.RealtimeAudioInputTurnDetectionParam: -// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionParam: +// case *realtime.RealtimeAudioInputTurnDetectionServerVadParam: +// case *realtime.RealtimeAudioInputTurnDetectionSemanticVadParam: +// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam: +// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam: // default: // fmt.Errorf("not present") // } func (u 
clientSecretNewParamsSessionUnionAudioInputTurnDetection) AsAny() any { return u.any } // Returns a pointer to the underlying variant's property, if present. -func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool { +func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.CreateResponse) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.CreateResponse) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetType() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetType() } return nil } // Returns a pointer to the underlying variant's property, if present. -func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string { +func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return (*string)(&vt.Eagerness) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return (*string)(&vt.Eagerness) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetCreateResponse() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetCreateResponse() } return nil } @@ -1564,10 +1691,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() // Returns a pointer to the underlying variant's property, if present. 
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeoutMs() *int64 { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.IdleTimeoutMs) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.IdleTimeoutMs) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetIdleTimeoutMs() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetIdleTimeoutMs() } return nil } @@ -1575,10 +1702,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeout // Returns a pointer to the underlying variant's property, if present. func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptResponse() *bool { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.InterruptResponse) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.InterruptResponse) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetInterruptResponse() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetInterruptResponse() } return nil } @@ -1586,10 +1713,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptRe // Returns a pointer to the underlying variant's property, if present. 
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddingMs() *int64 { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.PrefixPaddingMs) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.PrefixPaddingMs) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetPrefixPaddingMs() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetPrefixPaddingMs() } return nil } @@ -1597,10 +1724,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddi // Returns a pointer to the underlying variant's property, if present. func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDurationMs() *int64 { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.SilenceDurationMs) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.SilenceDurationMs) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetSilenceDurationMs() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetSilenceDurationMs() } return nil } @@ -1608,21 +1735,21 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDura // Returns a pointer to the underlying variant's property, if present. 
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetThreshold() *float64 { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.Threshold) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return paramutil.AddrIfPresent(vt.Threshold) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetThreshold() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetThreshold() } return nil } // Returns a pointer to the underlying variant's property, if present. -func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string { +func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string { switch vt := u.any.(type) { - case *RealtimeAudioInputTurnDetectionParam: - return (*string)(&vt.Type) - case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam: - return (*string)(&vt.Type) + case *RealtimeAudioInputTurnDetectionUnionParam: + return vt.GetEagerness() + case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam: + return vt.GetEagerness() } return nil } diff --git a/realtime/clientsecret_test.go b/realtime/clientsecret_test.go index 37ceb80..280d3c4 100644 --- a/realtime/clientsecret_test.go +++ b/realtime/clientsecret_test.go @@ -51,15 +51,15 @@ func TestClientSecretNewWithOptionalParams(t *testing.T) { Model: realtime.AudioTranscriptionModelWhisper1, Prompt: openai.String("prompt"), }, - TurnDetection: realtime.RealtimeAudioInputTurnDetectionParam{ - CreateResponse: openai.Bool(true), - Eagerness: realtime.RealtimeAudioInputTurnDetectionEagernessLow, - IdleTimeoutMs: openai.Int(0), - InterruptResponse: openai.Bool(true), - PrefixPaddingMs: openai.Int(0), - SilenceDurationMs: openai.Int(0), - Threshold: openai.Float(0), - Type: realtime.RealtimeAudioInputTurnDetectionTypeServerVad, + TurnDetection: realtime.RealtimeAudioInputTurnDetectionUnionParam{ + OfServerVad: 
&realtime.RealtimeAudioInputTurnDetectionServerVadParam{ + CreateResponse: openai.Bool(true), + IdleTimeoutMs: openai.Int(5000), + InterruptResponse: openai.Bool(true), + PrefixPaddingMs: openai.Int(0), + SilenceDurationMs: openai.Int(0), + Threshold: openai.Float(0), + }, }, }, Output: realtime.RealtimeAudioConfigOutputParam{ diff --git a/realtime/realtime.go b/realtime/realtime.go index f440e8d..a38db3c 100644 --- a/realtime/realtime.go +++ b/realtime/realtime.go @@ -141,6 +141,20 @@ func (r *RealtimeAudioConfigParam) UnmarshalJSON(data []byte) error { } type RealtimeAudioConfigInputParam struct { + // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be + // set to `null` to turn off, in which case the client must manually trigger model + // response. + // + // Server VAD means that the model will detect the start and end of speech based on + // audio volume and respond at the end of user speech. + // + // Semantic VAD is more advanced and uses a turn detection model (in conjunction + // with VAD) to semantically estimate whether the user has finished speaking, then + // dynamically sets a timeout based on this probability. For example, if user audio + // trails off with "uhhm", the model will score a low probability of turn end and + // wait longer for the user to continue speaking. This can be useful for more + // natural conversations, but may have a higher latency. + TurnDetection RealtimeAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"` // The format of the input audio. Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"` // Configuration for input audio noise reduction. This can be set to `null` to turn @@ -158,17 +172,6 @@ type RealtimeAudioConfigInputParam struct { // what the model heard. The client can optionally set the language and prompt for // transcription, these offer additional guidance to the transcription service. 
Transcription AudioTranscriptionParam `json:"transcription,omitzero"` - // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - // set to `null` to turn off, in which case the client must manually trigger model - // response. Server VAD means that the model will detect the start and end of - // speech based on audio volume and respond at the end of user speech. Semantic VAD - // is more advanced and uses a turn detection model (in conjunction with VAD) to - // semantically estimate whether the user has finished speaking, then dynamically - // sets a timeout based on this probability. For example, if user audio trails off - // with "uhhm", the model will score a low probability of turn end and wait longer - // for the user to continue speaking. This can be useful for more natural - // conversations, but may have a higher latency. - TurnDetection RealtimeAudioInputTurnDetectionParam `json:"turn_detection,omitzero"` paramObj } @@ -530,19 +533,126 @@ func init() { ) } -// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be -// set to `null` to turn off, in which case the client must manually trigger model -// response. Server VAD means that the model will detect the start and end of -// speech based on audio volume and respond at the end of user speech. Semantic VAD -// is more advanced and uses a turn detection model (in conjunction with VAD) to -// semantically estimate whether the user has finished speaking, then dynamically -// sets a timeout based on this probability. For example, if user audio trails off -// with "uhhm", the model will score a low probability of turn end and wait longer -// for the user to continue speaking. This can be useful for more natural -// conversations, but may have a higher latency. 
-type RealtimeAudioInputTurnDetectionParam struct { - // Optional idle timeout after which turn detection will auto-timeout when no - // additional audio is received and emits a `timeout_triggered` event. +// Only one field can be non-zero. +// +// Use [param.IsOmitted] to confirm if a field is set. +type RealtimeAudioInputTurnDetectionUnionParam struct { + OfServerVad *RealtimeAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"` + OfSemanticVad *RealtimeAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"` + paramUnion +} + +func (u RealtimeAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) { + return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad) +} +func (u *RealtimeAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error { + return apijson.UnmarshalRoot(data, u) +} + +func (u *RealtimeAudioInputTurnDetectionUnionParam) asAny() any { + if !param.IsOmitted(u.OfServerVad) { + return u.OfServerVad + } else if !param.IsOmitted(u.OfSemanticVad) { + return u.OfSemanticVad + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 { + if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() { + return &vt.IdleTimeoutMs.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 { + if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() { + return &vt.PrefixPaddingMs.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 { + if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() { + return &vt.SilenceDurationMs.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. 
+func (u RealtimeAudioInputTurnDetectionUnionParam) GetThreshold() *float64 { + if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() { + return &vt.Threshold.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetEagerness() *string { + if vt := u.OfSemanticVad; vt != nil { + return &vt.Eagerness + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetType() *string { + if vt := u.OfServerVad; vt != nil { + return (*string)(&vt.Type) + } else if vt := u.OfSemanticVad; vt != nil { + return (*string)(&vt.Type) + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool { + if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() { + return &vt.CreateResponse.Value + } else if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() { + return &vt.CreateResponse.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool { + if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() { + return &vt.InterruptResponse.Value + } else if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() { + return &vt.InterruptResponse.Value + } + return nil +} + +func init() { + apijson.RegisterUnion[RealtimeAudioInputTurnDetectionUnionParam]( + "type", + apijson.Discriminator[RealtimeAudioInputTurnDetectionServerVadParam]("server_vad"), + apijson.Discriminator[RealtimeAudioInputTurnDetectionSemanticVadParam]("semantic_vad"), + ) +} + +// Server-side voice activity detection (VAD) which flips on when user speech is +// detected and off after a period of silence. +// +// The property Type is required. 
+type RealtimeAudioInputTurnDetectionServerVadParam struct { + // Optional timeout after which a model response will be triggered automatically. + // This is useful for situations in which a long pause from the user is unexpected, + // such as a phone call. The model will effectively prompt the user to continue the + // conversation based on the current context. + // + // The timeout value will be applied after the last model response's audio has + // finished playing, i.e. it's set to the `response.done` time plus audio playback + // duration. + // + // An `input_audio_buffer.timeout_triggered` event (plus events associated with the + // Response) will be emitted when the timeout is reached. Idle timeout is currently + // only supported for `server_vad` mode. IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"` // Whether or not to automatically generate a response when a VAD stop event // occurs. @@ -562,48 +672,60 @@ type RealtimeAudioInputTurnDetectionParam struct { // defaults to 0.5. A higher threshold will require louder audio to activate the // model, and thus might perform better in noisy environments. Threshold param.Opt[float64] `json:"threshold,omitzero"` + // Type of turn detection, `server_vad` to turn on simple Server VAD. + // + // This field can be elided, and will marshal its zero value as "server_vad". + Type constant.ServerVad `json:"type,required"` + paramObj +} + +func (r RealtimeAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) { + type shadow RealtimeAudioInputTurnDetectionServerVadParam + return param.MarshalObject(r, (*shadow)(&r)) +} +func (r *RealtimeAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error { + return apijson.UnmarshalRoot(data, r) +} + +// Server-side semantic turn detection which uses a model to determine when the +// user has finished speaking. +// +// The property Type is required. 
+type RealtimeAudioInputTurnDetectionSemanticVadParam struct { + // Whether or not to automatically generate a response when a VAD stop event + // occurs. + CreateResponse param.Opt[bool] `json:"create_response,omitzero"` + // Whether or not to automatically interrupt any ongoing response with output to + // the default conversation (i.e. `conversation` of `auto`) when a VAD start event + // occurs. + InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"` // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` // will wait longer for the user to continue speaking, `high` will respond more // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, // and `high` have max timeouts of 8s, 4s, and 2s respectively. // // Any of "low", "medium", "high", "auto". - Eagerness RealtimeAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"` - // Type of turn detection. + Eagerness string `json:"eagerness,omitzero"` + // Type of turn detection, `semantic_vad` to turn on Semantic VAD. // - // Any of "server_vad", "semantic_vad". - Type RealtimeAudioInputTurnDetectionType `json:"type,omitzero"` + // This field can be elided, and will marshal its zero value as "semantic_vad". + Type constant.SemanticVad `json:"type,required"` paramObj } -func (r RealtimeAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) { - type shadow RealtimeAudioInputTurnDetectionParam +func (r RealtimeAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) { + type shadow RealtimeAudioInputTurnDetectionSemanticVadParam return param.MarshalObject(r, (*shadow)(&r)) } -func (r *RealtimeAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error { +func (r *RealtimeAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error { return apijson.UnmarshalRoot(data, r) } -// Used only for `semantic_vad` mode. The eagerness of the model to respond. 
`low` -// will wait longer for the user to continue speaking, `high` will respond more -// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, -// and `high` have max timeouts of 8s, 4s, and 2s respectively. -type RealtimeAudioInputTurnDetectionEagerness string - -const ( - RealtimeAudioInputTurnDetectionEagernessLow    RealtimeAudioInputTurnDetectionEagerness = "low" - RealtimeAudioInputTurnDetectionEagernessMedium RealtimeAudioInputTurnDetectionEagerness = "medium" - RealtimeAudioInputTurnDetectionEagernessHigh   RealtimeAudioInputTurnDetectionEagerness = "high" - RealtimeAudioInputTurnDetectionEagernessAuto   RealtimeAudioInputTurnDetectionEagerness = "auto" -) - -// Type of turn detection. -type RealtimeAudioInputTurnDetectionType string - -const ( - RealtimeAudioInputTurnDetectionTypeServerVad   RealtimeAudioInputTurnDetectionType = "server_vad" - RealtimeAudioInputTurnDetectionTypeSemanticVad RealtimeAudioInputTurnDetectionType = "semantic_vad" -) +func init() { + apijson.RegisterFieldValidator[RealtimeAudioInputTurnDetectionSemanticVadParam]( + "eagerness", "low", "medium", "high", "auto", + ) +} type RealtimeFunctionTool struct { // The description of the function, including guidance on when and how to call it, @@ -1264,6 +1386,20 @@ func (r *RealtimeTranscriptionSessionAudioParam) UnmarshalJSON(data []byte) erro } type RealtimeTranscriptionSessionAudioInputParam struct { + // Configuration for turn detection, either Server VAD or Semantic VAD. This can be + // set to `null` to turn off, in which case the client must manually trigger model + // response. + // + // Server VAD means that the model will detect the start and end of speech based on + // audio volume and respond at the end of user speech. + // + // Semantic VAD is more advanced and uses a turn detection model (in conjunction + // with VAD) to semantically estimate whether the user has finished speaking, then + // dynamically sets a timeout based on this probability.
For example, if user audio + // trails off with "uhhm", the model will score a low probability of turn end and + // wait longer for the user to continue speaking. This can be useful for more + // natural conversations, but may have a higher latency. + TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"` // The PCM audio format. Only a 24kHz sample rate is supported. Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"` // Configuration for input audio noise reduction. This can be set to `null` to turn @@ -1281,17 +1417,6 @@ type RealtimeTranscriptionSessionAudioInputParam struct { // what the model heard. The client can optionally set the language and prompt for // transcription, these offer additional guidance to the transcription service. Transcription AudioTranscriptionParam `json:"transcription,omitzero"` - // Configuration for turn detection, ether Server VAD or Semantic VAD. This can be - // set to `null` to turn off, in which case the client must manually trigger model - // response. Server VAD means that the model will detect the start and end of - // speech based on audio volume and respond at the end of user speech. Semantic VAD - // is more advanced and uses a turn detection model (in conjunction with VAD) to - // semantically estimate whether the user has finished speaking, then dynamically - // sets a timeout based on this probability. For example, if user audio trails off - // with "uhhm", the model will score a low probability of turn end and wait longer - // for the user to continue speaking. This can be useful for more natural - // conversations, but may have a higher latency. 
- TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionParam `json:"turn_detection,omitzero"` paramObj } @@ -1326,19 +1451,126 @@ func (r *RealtimeTranscriptionSessionAudioInputNoiseReductionParam) UnmarshalJSO return apijson.UnmarshalRoot(data, r) } -// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be -// set to `null` to turn off, in which case the client must manually trigger model -// response. Server VAD means that the model will detect the start and end of -// speech based on audio volume and respond at the end of user speech. Semantic VAD -// is more advanced and uses a turn detection model (in conjunction with VAD) to -// semantically estimate whether the user has finished speaking, then dynamically -// sets a timeout based on this probability. For example, if user audio trails off -// with "uhhm", the model will score a low probability of turn end and wait longer -// for the user to continue speaking. This can be useful for more natural -// conversations, but may have a higher latency. -type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct { - // Optional idle timeout after which turn detection will auto-timeout when no - // additional audio is received. +// Only one field can be non-zero. +// +// Use [param.IsOmitted] to confirm if a field is set. 
+type RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam struct { + OfServerVad *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"` + OfSemanticVad *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"` + paramUnion +} + +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) { + return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad) +} +func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error { + return apijson.UnmarshalRoot(data, u) +} + +func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) asAny() any { + if !param.IsOmitted(u.OfServerVad) { + return u.OfServerVad + } else if !param.IsOmitted(u.OfSemanticVad) { + return u.OfSemanticVad + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 { + if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() { + return &vt.IdleTimeoutMs.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 { + if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() { + return &vt.PrefixPaddingMs.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 { + if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() { + return &vt.SilenceDurationMs.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. 
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetThreshold() *float64 { + if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() { + return &vt.Threshold.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetEagerness() *string { + if vt := u.OfSemanticVad; vt != nil { + return &vt.Eagerness + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetType() *string { + if vt := u.OfServerVad; vt != nil { + return (*string)(&vt.Type) + } else if vt := u.OfSemanticVad; vt != nil { + return (*string)(&vt.Type) + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. +func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool { + if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() { + return &vt.CreateResponse.Value + } else if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() { + return &vt.CreateResponse.Value + } + return nil +} + +// Returns a pointer to the underlying variant's property, if present. 
+func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool { + if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() { + return &vt.InterruptResponse.Value + } else if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() { + return &vt.InterruptResponse.Value + } + return nil +} + +func init() { + apijson.RegisterUnion[RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam]( + "type", + apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam]("server_vad"), + apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]("semantic_vad"), + ) +} + +// Server-side voice activity detection (VAD) which flips on when user speech is +// detected and off after a period of silence. +// +// The property Type is required. +type RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam struct { + // Optional timeout after which a model response will be triggered automatically. + // This is useful for situations in which a long pause from the user is unexpected, + // such as a phone call. The model will effectively prompt the user to continue the + // conversation based on the current context. + // + // The timeout value will be applied after the last model response's audio has + // finished playing, i.e. it's set to the `response.done` time plus audio playback + // duration. + // + // An `input_audio_buffer.timeout_triggered` event (plus events associated with the + // Response) will be emitted when the timeout is reached. Idle timeout is currently + // only supported for `server_vad` mode. IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"` // Whether or not to automatically generate a response when a VAD stop event // occurs. @@ -1358,46 +1590,60 @@ type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct { // defaults to 0.5. 
A higher threshold will require louder audio to activate the // model, and thus might perform better in noisy environments. Threshold param.Opt[float64] `json:"threshold,omitzero"` - // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` - // will wait longer for the user to continue speaking, `high` will respond more - // quickly. `auto` is the default and is equivalent to `medium`. + // Type of turn detection, `server_vad` to turn on simple Server VAD. // - // Any of "low", "medium", "high", "auto". - Eagerness RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"` - // Type of turn detection. - // - // Any of "server_vad", "semantic_vad". - Type RealtimeTranscriptionSessionAudioInputTurnDetectionType `json:"type,omitzero"` + // This field can be elided, and will marshal its zero value as "server_vad". + Type constant.ServerVad `json:"type,required"` paramObj } -func (r RealtimeTranscriptionSessionAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) { - type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionParam +func (r RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) { + type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam return param.MarshalObject(r, (*shadow)(&r)) } -func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error { +func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error { return apijson.UnmarshalRoot(data, r) } -// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` -// will wait longer for the user to continue speaking, `high` will respond more -// quickly. `auto` is the default and is equivalent to `medium`. 
-type RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness string +// Server-side semantic turn detection which uses a model to determine when the +// user has finished speaking. +// +// The property Type is required. +type RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam struct { + // Whether or not to automatically generate a response when a VAD stop event + // occurs. + CreateResponse param.Opt[bool] `json:"create_response,omitzero"` + // Whether or not to automatically interrupt any ongoing response with output to + // the default conversation (i.e. `conversation` of `auto`) when a VAD start event + // occurs. + InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"` + // Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` + // will wait longer for the user to continue speaking, `high` will respond more + // quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`, + // and `high` have max timeouts of 8s, 4s, and 2s respectively. + // + // Any of "low", "medium", "high", "auto". + Eagerness string `json:"eagerness,omitzero"` + // Type of turn detection, `semantic_vad` to turn on Semantic VAD. + // + // This field can be elided, and will marshal its zero value as "semantic_vad". 
+ Type constant.SemanticVad `json:"type,required"` + paramObj +} -const ( - RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessLow RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "low" - RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessMedium RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "medium" - RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessHigh RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "high" - RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessAuto RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "auto" -) +func (r RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) { + type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam + return param.MarshalObject(r, (*shadow)(&r)) +} +func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error { + return apijson.UnmarshalRoot(data, r) +} -// Type of turn detection. -type RealtimeTranscriptionSessionAudioInputTurnDetectionType string - -const ( - RealtimeTranscriptionSessionAudioInputTurnDetectionTypeServerVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "server_vad" - RealtimeTranscriptionSessionAudioInputTurnDetectionTypeSemanticVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "semantic_vad" -) +func init() { + apijson.RegisterFieldValidator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]( + "eagerness", "low", "medium", "high", "auto", + ) +} // Realtime transcription session object configuration. // diff --git a/responses/response.go b/responses/response.go index 2eb655e..e3b20d5 100644 --- a/responses/response.go +++ b/responses/response.go @@ -879,10 +879,10 @@ type Response struct { TopLogprobs int64 `json:"top_logprobs,nullable"` // The truncation strategy to use for the model response. 
// - // - `auto`: If the context of this response and previous ones exceeds the model's - // context window size, the model will truncate the response to fit the context - // window by dropping input items in the middle of the conversation. - // - `disabled` (default): If a model response will exceed the context window size + // - `auto`: If the input to this Response exceeds the model's context window size, + // the model will truncate the response to fit the context window by dropping + // items from the beginning of the conversation. + // - `disabled` (default): If the input size will exceed the context window size // for a model, the request will fail with a 400 error. // // Any of "auto", "disabled". @@ -1125,10 +1125,10 @@ const ( // The truncation strategy to use for the model response. // -// - `auto`: If the context of this response and previous ones exceeds the model's -// context window size, the model will truncate the response to fit the context -// window by dropping input items in the middle of the conversation. -// - `disabled` (default): If a model response will exceed the context window size +// - `auto`: If the input to this Response exceeds the model's context window size, +// the model will truncate the response to fit the context window by dropping +// items from the beginning of the conversation. +// - `disabled` (default): If the input size will exceed the context window size // for a model, the request will fail with a 400 error. type ResponseTruncation string @@ -14285,10 +14285,10 @@ type ResponseNewParams struct { StreamOptions ResponseNewParamsStreamOptions `json:"stream_options,omitzero"` // The truncation strategy to use for the model response. // - // - `auto`: If the context of this response and previous ones exceeds the model's - // context window size, the model will truncate the response to fit the context - // window by dropping input items in the middle of the conversation. 
- // - `disabled` (default): If a model response will exceed the context window size + // - `auto`: If the input to this Response exceeds the model's context window size, + // the model will truncate the response to fit the context window by dropping + // items from the beginning of the conversation. + // - `disabled` (default): If the input size will exceed the context window size // for a model, the request will fail with a 400 error. // // Any of "auto", "disabled". @@ -14548,10 +14548,10 @@ func (u ResponseNewParamsToolChoiceUnion) GetName() *string { // The truncation strategy to use for the model response. // -// - `auto`: If the context of this response and previous ones exceeds the model's -// context window size, the model will truncate the response to fit the context -// window by dropping input items in the middle of the conversation. -// - `disabled` (default): If a model response will exceed the context window size +// - `auto`: If the input to this Response exceeds the model's context window size, +// the model will truncate the response to fit the context window by dropping +// items from the beginning of the conversation. +// - `disabled` (default): If the input size will exceed the context window size // for a model, the request will fail with a 400 error. 
type ResponseNewParamsTruncation string diff --git a/shared/constant/constants.go b/shared/constant/constants.go index c2b547a..de5e32a 100644 --- a/shared/constant/constants.go +++ b/shared/constant/constants.go @@ -217,6 +217,8 @@ type ScoreModel string // Always "score_mo type Screenshot string // Always "screenshot" type Scroll string // Always "scroll" type Search string // Always "search" +type SemanticVad string // Always "semantic_vad" +type ServerVad string // Always "server_vad" type SessionCreated string // Always "session.created" type SessionUpdate string // Always "session.update" type SessionUpdated string // Always "session.updated" @@ -610,6 +612,8 @@ func (c ScoreModel) Default() ScoreModel { return "score func (c Screenshot) Default() Screenshot { return "screenshot" } func (c Scroll) Default() Scroll { return "scroll" } func (c Search) Default() Search { return "search" } +func (c SemanticVad) Default() SemanticVad { return "semantic_vad" } +func (c ServerVad) Default() ServerVad { return "server_vad" } func (c SessionCreated) Default() SessionCreated { return "session.created" } func (c SessionUpdate) Default() SessionUpdate { return "session.update" } func (c SessionUpdated) Default() SessionUpdated { return "session.updated" } @@ -903,6 +907,8 @@ func (c ScoreModel) MarshalJSON() ([]byte, error) { retu func (c Screenshot) MarshalJSON() ([]byte, error) { return marshalString(c) } func (c Scroll) MarshalJSON() ([]byte, error) { return marshalString(c) } func (c Search) MarshalJSON() ([]byte, error) { return marshalString(c) } +func (c SemanticVad) MarshalJSON() ([]byte, error) { return marshalString(c) } +func (c ServerVad) MarshalJSON() ([]byte, error) { return marshalString(c) } func (c SessionCreated) MarshalJSON() ([]byte, error) { return marshalString(c) } func (c SessionUpdate) MarshalJSON() ([]byte, error) { return marshalString(c) } func (c SessionUpdated) MarshalJSON() ([]byte, error) { return marshalString(c) }