chore(api): Minor docs and type updates for realtime

This commit is contained in:
stainless-app[bot]
2025-09-11 18:03:52 +00:00
parent f03535ac39
commit d92ea4850f
7 changed files with 605 additions and 226 deletions

View File

@@ -1,4 +1,4 @@
configured_endpoints: 106
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
config_hash: 930dac3aa861344867e4ac84f037b5df

4
api.md
View File

@@ -793,7 +793,7 @@ Params Types:
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioConfigInputParam">RealtimeAudioConfigInputParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioConfigOutputParam">RealtimeAudioConfigOutputParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioFormatsUnionParam">RealtimeAudioFormatsUnionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioInputTurnDetectionParam">RealtimeAudioInputTurnDetectionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioInputTurnDetectionUnionParam">RealtimeAudioInputTurnDetectionUnionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeFunctionToolParam">RealtimeFunctionToolParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeSessionCreateRequestParam">RealtimeSessionCreateRequestParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeToolChoiceConfigUnionParam">RealtimeToolChoiceConfigUnionParam</a>
@@ -802,7 +802,7 @@ Params Types:
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTracingConfigUnionParam">RealtimeTracingConfigUnionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioParam">RealtimeTranscriptionSessionAudioParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioInputParam">RealtimeTranscriptionSessionAudioInputParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioInputTurnDetectionParam">RealtimeTranscriptionSessionAudioInputTurnDetectionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam">RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionCreateRequestParam">RealtimeTranscriptionSessionCreateRequestParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTruncationUnionParam">RealtimeTruncationUnionParam</a>
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTruncationRetentionRatioParam">RealtimeTruncationRetentionRatioParam</a>

View File

@@ -8,7 +8,6 @@ import (
"net/http"
"github.com/openai/openai-go/v2/internal/apijson"
"github.com/openai/openai-go/v2/internal/paramutil"
"github.com/openai/openai-go/v2/internal/requestconfig"
"github.com/openai/openai-go/v2/option"
"github.com/openai/openai-go/v2/packages/param"
@@ -192,15 +191,18 @@ type RealtimeSessionCreateResponseAudioInput struct {
Transcription AudioTranscription `json:"transcription"`
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response. Server VAD means that the model will detect the start and end of
// speech based on audio volume and respond at the end of user speech. Semantic VAD
// is more advanced and uses a turn detection model (in conjunction with VAD) to
// semantically estimate whether the user has finished speaking, then dynamically
// sets a timeout based on this probability. For example, if user audio trails off
// with "uhhm", the model will score a low probability of turn end and wait longer
// for the user to continue speaking. This can be useful for more natural
// conversations, but may have a higher latency.
TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetection `json:"turn_detection"`
// response.
//
// Server VAD means that the model will detect the start and end of speech based on
// audio volume and respond at the end of user speech.
//
// Semantic VAD is more advanced and uses a turn detection model (in conjunction
// with VAD) to semantically estimate whether the user has finished speaking, then
// dynamically sets a timeout based on this probability. For example, if user audio
// trails off with "uhhm", the model will score a low probability of turn end and
// wait longer for the user to continue speaking. This can be useful for more
// natural conversations, but may have a higher latency.
TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetectionUnion `json:"turn_detection,nullable"`
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
JSON struct {
Format respjson.Field
@@ -244,29 +246,118 @@ func (r *RealtimeSessionCreateResponseAudioInputNoiseReduction) UnmarshalJSON(da
return apijson.UnmarshalRoot(data, r)
}
// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response. Server VAD means that the model will detect the start and end of
// speech based on audio volume and respond at the end of user speech. Semantic VAD
// is more advanced and uses a turn detection model (in conjunction with VAD) to
// semantically estimate whether the user has finished speaking, then dynamically
// sets a timeout based on this probability. For example, if user audio trails off
// with "uhhm", the model will score a low probability of turn end and wait longer
// for the user to continue speaking. This can be useful for more natural
// conversations, but may have a higher latency.
type RealtimeSessionCreateResponseAudioInputTurnDetection struct {
// RealtimeSessionCreateResponseAudioInputTurnDetectionUnion contains all possible
// properties and values from
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad],
// [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad].
//
// Use the [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny] method
// to switch on the variant.
//
// Use the methods beginning with 'As' to cast the union to one of its variants.
type RealtimeSessionCreateResponseAudioInputTurnDetectionUnion struct {
// Any of "server_vad", "semantic_vad".
Type string `json:"type"`
// This field is present in both variants.
CreateResponse bool `json:"create_response"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
IdleTimeoutMs int64 `json:"idle_timeout_ms"`
// This field is present in both variants.
InterruptResponse bool `json:"interrupt_response"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
PrefixPaddingMs int64 `json:"prefix_padding_ms"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
SilenceDurationMs int64 `json:"silence_duration_ms"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
Threshold float64 `json:"threshold"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad].
Eagerness string `json:"eagerness"`
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
JSON struct {
Type respjson.Field
CreateResponse respjson.Field
IdleTimeoutMs respjson.Field
InterruptResponse respjson.Field
PrefixPaddingMs respjson.Field
SilenceDurationMs respjson.Field
Threshold respjson.Field
Eagerness respjson.Field
// raw holds the unmodified JSON received from the API; exposed via RawJSON().
raw string
} `json:"-"`
}
// anyRealtimeSessionCreateResponseAudioInputTurnDetection is implemented by each
// variant of [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion] to add
// type safety for the return type of
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny].
type anyRealtimeSessionCreateResponseAudioInputTurnDetection interface {
// Marker method; implemented only by the ServerVad and SemanticVad variants.
implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion()
}
// Marker-method implementations tying each variant to the union's AsAny return
// interface.
func (RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() {
}
func (RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() {
}
// Use the following switch statement to find the correct variant
//
// switch variant := RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny().(type) {
// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad:
// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad:
// default:
// fmt.Errorf("no variant present")
// }
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsAny() anyRealtimeSessionCreateResponseAudioInputTurnDetection {
switch u.Type {
case "server_vad":
return u.AsServerVad()
case "semantic_vad":
return u.AsSemanticVad()
}
// Unknown or absent "type" discriminator: no variant present.
return nil
}
// AsServerVad re-decodes the union's raw JSON into the server_vad variant.
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsServerVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) {
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
return
}
// AsSemanticVad re-decodes the union's raw JSON into the semantic_vad variant.
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsSemanticVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) {
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
return
}
// RawJSON returns the unmodified JSON received from the API.
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) RawJSON() string {
return u.JSON.raw
}
// UnmarshalJSON decodes the union from the raw JSON received from the API.
func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
// Server-side voice activity detection (VAD) which flips on when user speech is
// detected and off after a period of silence.
type RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad struct {
// Type of turn detection, `server_vad` to turn on simple Server VAD.
Type constant.ServerVad `json:"type,required"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
CreateResponse bool `json:"create_response"`
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
// Optional timeout after which a model response will be triggered automatically.
// This is useful for situations in which a long pause from the user is unexpected,
// such as a phone call. The model will effectively prompt the user to continue the
// conversation based on the current context.
//
// Any of "low", "medium", "high", "auto".
Eagerness string `json:"eagerness"`
// Optional idle timeout after which turn detection will auto-timeout when no
// additional audio is received and emits a `timeout_triggered` event.
// The timeout value will be applied after the last model response's audio has
// finished playing, i.e. it's set to the `response.done` time plus audio playback
// duration.
//
// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
// Response) will be emitted when the timeout is reached. Idle timeout is currently
// only supported for `server_vad` mode.
IdleTimeoutMs int64 `json:"idle_timeout_ms,nullable"`
// Whether or not to automatically interrupt any ongoing response with output to
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
@@ -283,28 +374,63 @@ type RealtimeSessionCreateResponseAudioInputTurnDetection struct {
// defaults to 0.5. A higher threshold will require louder audio to activate the
// model, and thus might perform better in noisy environments.
Threshold float64 `json:"threshold"`
// Type of turn detection.
//
// Any of "server_vad", "semantic_vad".
Type string `json:"type"`
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
JSON struct {
Type respjson.Field
CreateResponse respjson.Field
Eagerness respjson.Field
IdleTimeoutMs respjson.Field
InterruptResponse respjson.Field
PrefixPaddingMs respjson.Field
SilenceDurationMs respjson.Field
Threshold respjson.Field
Type respjson.Field
ExtraFields map[string]respjson.Field
raw string
} `json:"-"`
}
// Returns the unmodified JSON received from the API
func (r RealtimeSessionCreateResponseAudioInputTurnDetection) RawJSON() string { return r.JSON.raw }
func (r *RealtimeSessionCreateResponseAudioInputTurnDetection) UnmarshalJSON(data []byte) error {
func (r RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) RawJSON() string {
return r.JSON.raw
}
func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
// Server-side semantic turn detection which uses a model to determine when the
// user has finished speaking.
type RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad struct {
// Type of turn detection, `semantic_vad` to turn on Semantic VAD.
Type constant.SemanticVad `json:"type,required"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
CreateResponse bool `json:"create_response"`
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
//
// Any of "low", "medium", "high", "auto".
Eagerness string `json:"eagerness"`
// Whether or not to automatically interrupt any ongoing response with output to
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
// occurs.
InterruptResponse bool `json:"interrupt_response"`
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
JSON struct {
Type respjson.Field
CreateResponse respjson.Field
Eagerness respjson.Field
InterruptResponse respjson.Field
ExtraFields map[string]respjson.Field
// raw holds the unmodified JSON received from the API; exposed via RawJSON().
raw string
} `json:"-"`
}
// RawJSON returns the unmodified JSON received from the API.
func (r RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) RawJSON() string {
return r.JSON.raw
}
// UnmarshalJSON decodes the semantic_vad variant from the raw JSON received from
// the API.
func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
@@ -1152,7 +1278,8 @@ type ClientSecretNewResponseSessionUnionAudioInput struct {
NoiseReduction ClientSecretNewResponseSessionUnionAudioInputNoiseReduction `json:"noise_reduction"`
// This field is from variant [RealtimeSessionCreateResponseAudioInput].
Transcription AudioTranscription `json:"transcription"`
// This field is a union of [RealtimeSessionCreateResponseAudioInputTurnDetection],
// This field is a union of
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion],
// [RealtimeTranscriptionSessionTurnDetection]
TurnDetection ClientSecretNewResponseSessionUnionAudioInputTurnDetection `json:"turn_detection"`
JSON struct {
@@ -1197,31 +1324,27 @@ func (r *ClientSecretNewResponseSessionUnionAudioInputNoiseReduction) UnmarshalJ
// For type safety it is recommended to directly use a variant of the
// [ClientSecretNewResponseSessionUnion].
type ClientSecretNewResponseSessionUnionAudioInputTurnDetection struct {
Type string `json:"type"`
CreateResponse bool `json:"create_response"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
CreateResponse bool `json:"create_response"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
Eagerness string `json:"eagerness"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
IdleTimeoutMs int64 `json:"idle_timeout_ms"`
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion].
IdleTimeoutMs int64 `json:"idle_timeout_ms"`
InterruptResponse bool `json:"interrupt_response"`
PrefixPaddingMs int64 `json:"prefix_padding_ms"`
SilenceDurationMs int64 `json:"silence_duration_ms"`
Threshold float64 `json:"threshold"`
Type string `json:"type"`
JSON struct {
// This field is from variant
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion].
Eagerness string `json:"eagerness"`
JSON struct {
Type respjson.Field
CreateResponse respjson.Field
Eagerness respjson.Field
IdleTimeoutMs respjson.Field
InterruptResponse respjson.Field
PrefixPaddingMs respjson.Field
SilenceDurationMs respjson.Field
Threshold respjson.Field
Type respjson.Field
Eagerness respjson.Field
raw string
} `json:"-"`
}
@@ -1518,45 +1641,49 @@ func (u clientSecretNewParamsSessionUnionAudioInput) GetTranscription() *AudioTr
func (u clientSecretNewParamsSessionUnionAudioInput) GetTurnDetection() (res clientSecretNewParamsSessionUnionAudioInputTurnDetection) {
switch vt := u.any.(type) {
case *RealtimeAudioConfigInputParam:
res.any = &vt.TurnDetection
res.any = vt.TurnDetection
case *RealtimeTranscriptionSessionAudioInputParam:
res.any = &vt.TurnDetection
res.any = vt.TurnDetection
}
return res
}
// Can have the runtime types [*RealtimeAudioInputTurnDetectionParam],
// [*RealtimeTranscriptionSessionAudioInputTurnDetectionParam]
// Can have the runtime types [*RealtimeAudioInputTurnDetectionServerVadParam],
// [*RealtimeAudioInputTurnDetectionSemanticVadParam],
// [*RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam],
// [*RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]
type clientSecretNewParamsSessionUnionAudioInputTurnDetection struct{ any }
// Use the following switch statement to get the type of the union:
//
// switch u.AsAny().(type) {
// case *realtime.RealtimeAudioInputTurnDetectionParam:
// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
// case *realtime.RealtimeAudioInputTurnDetectionServerVadParam:
// case *realtime.RealtimeAudioInputTurnDetectionSemanticVadParam:
// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam:
// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam:
// default:
// fmt.Errorf("not present")
// }
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) AsAny() any { return u.any }
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool {
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.CreateResponse)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.CreateResponse)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetType()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetType()
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string {
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return (*string)(&vt.Eagerness)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return (*string)(&vt.Eagerness)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetCreateResponse()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetCreateResponse()
}
return nil
}
@@ -1564,10 +1691,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness()
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeoutMs() *int64 {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.IdleTimeoutMs)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.IdleTimeoutMs)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetIdleTimeoutMs()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetIdleTimeoutMs()
}
return nil
}
@@ -1575,10 +1702,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeout
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptResponse() *bool {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.InterruptResponse)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.InterruptResponse)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetInterruptResponse()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetInterruptResponse()
}
return nil
}
@@ -1586,10 +1713,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptRe
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddingMs() *int64 {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.PrefixPaddingMs)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.PrefixPaddingMs)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetPrefixPaddingMs()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetPrefixPaddingMs()
}
return nil
}
@@ -1597,10 +1724,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddi
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDurationMs() *int64 {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.SilenceDurationMs)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.SilenceDurationMs)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetSilenceDurationMs()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetSilenceDurationMs()
}
return nil
}
@@ -1608,21 +1735,21 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDura
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetThreshold() *float64 {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.Threshold)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return paramutil.AddrIfPresent(vt.Threshold)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetThreshold()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetThreshold()
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string {
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string {
switch vt := u.any.(type) {
case *RealtimeAudioInputTurnDetectionParam:
return (*string)(&vt.Type)
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
return (*string)(&vt.Type)
case *RealtimeAudioInputTurnDetectionUnionParam:
return vt.GetEagerness()
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
return vt.GetEagerness()
}
return nil
}

View File

@@ -51,15 +51,15 @@ func TestClientSecretNewWithOptionalParams(t *testing.T) {
Model: realtime.AudioTranscriptionModelWhisper1,
Prompt: openai.String("prompt"),
},
TurnDetection: realtime.RealtimeAudioInputTurnDetectionParam{
CreateResponse: openai.Bool(true),
Eagerness: realtime.RealtimeAudioInputTurnDetectionEagernessLow,
IdleTimeoutMs: openai.Int(0),
InterruptResponse: openai.Bool(true),
PrefixPaddingMs: openai.Int(0),
SilenceDurationMs: openai.Int(0),
Threshold: openai.Float(0),
Type: realtime.RealtimeAudioInputTurnDetectionTypeServerVad,
TurnDetection: realtime.RealtimeAudioInputTurnDetectionUnionParam{
OfServerVad: &realtime.RealtimeAudioInputTurnDetectionServerVadParam{
CreateResponse: openai.Bool(true),
IdleTimeoutMs: openai.Int(5000),
InterruptResponse: openai.Bool(true),
PrefixPaddingMs: openai.Int(0),
SilenceDurationMs: openai.Int(0),
Threshold: openai.Float(0),
},
},
},
Output: realtime.RealtimeAudioConfigOutputParam{

View File

@@ -141,6 +141,20 @@ func (r *RealtimeAudioConfigParam) UnmarshalJSON(data []byte) error {
}
type RealtimeAudioConfigInputParam struct {
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response.
//
// Server VAD means that the model will detect the start and end of speech based on
// audio volume and respond at the end of user speech.
//
// Semantic VAD is more advanced and uses a turn detection model (in conjunction
// with VAD) to semantically estimate whether the user has finished speaking, then
// dynamically sets a timeout based on this probability. For example, if user audio
// trails off with "uhhm", the model will score a low probability of turn end and
// wait longer for the user to continue speaking. This can be useful for more
// natural conversations, but may have a higher latency.
TurnDetection RealtimeAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"`
// The format of the input audio.
Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"`
// Configuration for input audio noise reduction. This can be set to `null` to turn
@@ -158,17 +172,6 @@ type RealtimeAudioConfigInputParam struct {
// what the model heard. The client can optionally set the language and prompt for
// transcription, these offer additional guidance to the transcription service.
Transcription AudioTranscriptionParam `json:"transcription,omitzero"`
// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response. Server VAD means that the model will detect the start and end of
// speech based on audio volume and respond at the end of user speech. Semantic VAD
// is more advanced and uses a turn detection model (in conjunction with VAD) to
// semantically estimate whether the user has finished speaking, then dynamically
// sets a timeout based on this probability. For example, if user audio trails off
// with "uhhm", the model will score a low probability of turn end and wait longer
// for the user to continue speaking. This can be useful for more natural
// conversations, but may have a higher latency.
TurnDetection RealtimeAudioInputTurnDetectionParam `json:"turn_detection,omitzero"`
paramObj
}
@@ -530,19 +533,126 @@ func init() {
)
}
// Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response. Server VAD means that the model will detect the start and end of
// speech based on audio volume and respond at the end of user speech. Semantic VAD
// is more advanced and uses a turn detection model (in conjunction with VAD) to
// semantically estimate whether the user has finished speaking, then dynamically
// sets a timeout based on this probability. For example, if user audio trails off
// with "uhhm", the model will score a low probability of turn end and wait longer
// for the user to continue speaking. This can be useful for more natural
// conversations, but may have a higher latency.
type RealtimeAudioInputTurnDetectionParam struct {
// Optional idle timeout after which turn detection will auto-timeout when no
// additional audio is received and emits a `timeout_triggered` event.
// Only one field can be non-zero.
//
// Use [param.IsOmitted] to confirm if a field is set.
type RealtimeAudioInputTurnDetectionUnionParam struct {
// Set when the turn detection is the server_vad variant.
OfServerVad *RealtimeAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"`
// Set when the turn detection is the semantic_vad variant.
OfSemanticVad *RealtimeAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"`
paramUnion
}
// MarshalJSON serializes whichever variant field is set; at most one may be
// non-zero.
func (u RealtimeAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) {
return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad)
}
// UnmarshalJSON decodes into the matching variant using the "type" discriminator
// registered in init.
func (u *RealtimeAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, u)
}
// asAny returns the variant that is set, or nil when neither is.
func (u *RealtimeAudioInputTurnDetectionUnionParam) asAny() any {
if !param.IsOmitted(u.OfServerVad) {
return u.OfServerVad
} else if !param.IsOmitted(u.OfSemanticVad) {
return u.OfSemanticVad
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
// Only the server_vad variant carries this field.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 {
if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() {
return &vt.IdleTimeoutMs.Value
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
// Only the server_vad variant carries this field.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 {
if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() {
return &vt.PrefixPaddingMs.Value
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
// Only the server_vad variant carries this field.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 {
if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() {
return &vt.SilenceDurationMs.Value
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
// Only the server_vad variant carries this field.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetThreshold() *float64 {
if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() {
return &vt.Threshold.Value
}
return nil
}
// Returns a pointer to the underlying variant's property, if present.
// Only the semantic_vad variant carries this field.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetEagerness() *string {
if vt := u.OfSemanticVad; vt != nil {
return &vt.Eagerness
}
return nil
}
// GetType returns a pointer to the set variant's "type" discriminator
// ("server_vad" or "semantic_vad"), or nil when no variant is set.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetType() *string {
	switch {
	case u.OfServerVad != nil:
		return (*string)(&u.OfServerVad.Type)
	case u.OfSemanticVad != nil:
		return (*string)(&u.OfSemanticVad.Type)
	default:
		return nil
	}
}
// GetCreateResponse returns a pointer to whichever variant's
// CreateResponse option carries a value, or nil when neither does.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool {
	if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() {
		return &vt.CreateResponse.Value
	}
	if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() {
		return &vt.CreateResponse.Value
	}
	return nil
}
// GetInterruptResponse returns a pointer to whichever variant's
// InterruptResponse option carries a value, or nil when neither does.
func (u RealtimeAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool {
	if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() {
		return &vt.InterruptResponse.Value
	}
	if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() {
		return &vt.InterruptResponse.Value
	}
	return nil
}
// Register the union's variants so UnmarshalJSON can dispatch on the
// "type" discriminator field.
func init() {
	apijson.RegisterUnion[RealtimeAudioInputTurnDetectionUnionParam](
		"type",
		apijson.Discriminator[RealtimeAudioInputTurnDetectionServerVadParam]("server_vad"),
		apijson.Discriminator[RealtimeAudioInputTurnDetectionSemanticVadParam]("semantic_vad"),
	)
}
// Server-side voice activity detection (VAD) which flips on when user speech is
// detected and off after a period of silence.
//
// The property Type is required.
type RealtimeAudioInputTurnDetectionServerVadParam struct {
// Optional timeout after which a model response will be triggered automatically.
// This is useful for situations in which a long pause from the user is unexpected,
// such as a phone call. The model will effectively prompt the user to continue the
// conversation based on the current context.
//
// The timeout value will be applied after the last model response's audio has
// finished playing, i.e. it's set to the `response.done` time plus audio playback
// duration.
//
// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
// Response) will be emitted when the timeout is reached. Idle timeout is currently
// only supported for `server_vad` mode.
IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
@@ -562,48 +672,60 @@ type RealtimeAudioInputTurnDetectionParam struct {
// defaults to 0.5. A higher threshold will require louder audio to activate the
// model, and thus might perform better in noisy environments.
Threshold param.Opt[float64] `json:"threshold,omitzero"`
// Type of turn detection, `server_vad` to turn on simple Server VAD.
//
// This field can be elided, and will marshal its zero value as "server_vad".
Type constant.ServerVad `json:"type,required"`
paramObj
}
// MarshalJSON serializes the param. The local shadow type has the same
// fields but no methods, so encoding it does not re-enter this method.
func (r RealtimeAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) {
	type shadow RealtimeAudioInputTurnDetectionServerVadParam
	return param.MarshalObject(r, (*shadow)(&r))
}
// UnmarshalJSON populates the param from raw JSON.
func (r *RealtimeAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}
// Server-side semantic turn detection which uses a model to determine when the
// user has finished speaking.
//
// The property Type is required.
type RealtimeAudioInputTurnDetectionSemanticVadParam struct {
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
CreateResponse param.Opt[bool] `json:"create_response,omitzero"`
// Whether or not to automatically interrupt any ongoing response with output to
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
// occurs.
InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"`
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
//
// Any of "low", "medium", "high", "auto".
Eagerness RealtimeAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"`
// Type of turn detection.
Eagerness string `json:"eagerness,omitzero"`
// Type of turn detection, `semantic_vad` to turn on Semantic VAD.
//
// Any of "server_vad", "semantic_vad".
Type RealtimeAudioInputTurnDetectionType `json:"type,omitzero"`
// This field can be elided, and will marshal its zero value as "semantic_vad".
Type constant.SemanticVad `json:"type,required"`
paramObj
}
func (r RealtimeAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) {
type shadow RealtimeAudioInputTurnDetectionParam
func (r RealtimeAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) {
type shadow RealtimeAudioInputTurnDetectionSemanticVadParam
return param.MarshalObject(r, (*shadow)(&r))
}
func (r *RealtimeAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error {
func (r *RealtimeAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
type RealtimeAudioInputTurnDetectionEagerness string

// Allowed values for RealtimeAudioInputTurnDetectionEagerness.
const (
	RealtimeAudioInputTurnDetectionEagernessLow    RealtimeAudioInputTurnDetectionEagerness = "low"
	RealtimeAudioInputTurnDetectionEagernessMedium RealtimeAudioInputTurnDetectionEagerness = "medium"
	RealtimeAudioInputTurnDetectionEagernessHigh   RealtimeAudioInputTurnDetectionEagerness = "high"
	RealtimeAudioInputTurnDetectionEagernessAuto   RealtimeAudioInputTurnDetectionEagerness = "auto"
)
// Type of turn detection.
type RealtimeAudioInputTurnDetectionType string

// Allowed values for RealtimeAudioInputTurnDetectionType.
const (
	RealtimeAudioInputTurnDetectionTypeServerVad   RealtimeAudioInputTurnDetectionType = "server_vad"
	RealtimeAudioInputTurnDetectionTypeSemanticVad RealtimeAudioInputTurnDetectionType = "semantic_vad"
)
// Restrict the semantic VAD "eagerness" field to its documented values.
func init() {
	apijson.RegisterFieldValidator[RealtimeAudioInputTurnDetectionSemanticVadParam](
		"eagerness", "low", "medium", "high", "auto",
	)
}
type RealtimeFunctionTool struct {
// The description of the function, including guidance on when and how to call it,
@@ -1264,6 +1386,20 @@ func (r *RealtimeTranscriptionSessionAudioParam) UnmarshalJSON(data []byte) erro
}
type RealtimeTranscriptionSessionAudioInputParam struct {
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response.
//
// Server VAD means that the model will detect the start and end of speech based on
// audio volume and respond at the end of user speech.
//
// Semantic VAD is more advanced and uses a turn detection model (in conjunction
// with VAD) to semantically estimate whether the user has finished speaking, then
// dynamically sets a timeout based on this probability. For example, if user audio
// trails off with "uhhm", the model will score a low probability of turn end and
// wait longer for the user to continue speaking. This can be useful for more
// natural conversations, but may have a higher latency.
TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"`
// The PCM audio format. Only a 24kHz sample rate is supported.
Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"`
// Configuration for input audio noise reduction. This can be set to `null` to turn
@@ -1281,17 +1417,6 @@ type RealtimeTranscriptionSessionAudioInputParam struct {
// what the model heard. The client can optionally set the language and prompt for
// transcription, these offer additional guidance to the transcription service.
Transcription AudioTranscriptionParam `json:"transcription,omitzero"`
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response. Server VAD means that the model will detect the start and end of
// speech based on audio volume and respond at the end of user speech. Semantic VAD
// is more advanced and uses a turn detection model (in conjunction with VAD) to
// semantically estimate whether the user has finished speaking, then dynamically
// sets a timeout based on this probability. For example, if user audio trails off
// with "uhhm", the model will score a low probability of turn end and wait longer
// for the user to continue speaking. This can be useful for more natural
// conversations, but may have a higher latency.
TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionParam `json:"turn_detection,omitzero"`
paramObj
}
@@ -1326,19 +1451,126 @@ func (r *RealtimeTranscriptionSessionAudioInputNoiseReductionParam) UnmarshalJSO
return apijson.UnmarshalRoot(data, r)
}
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
// set to `null` to turn off, in which case the client must manually trigger model
// response. Server VAD means that the model will detect the start and end of
// speech based on audio volume and respond at the end of user speech. Semantic VAD
// is more advanced and uses a turn detection model (in conjunction with VAD) to
// semantically estimate whether the user has finished speaking, then dynamically
// sets a timeout based on this probability. For example, if user audio trails off
// with "uhhm", the model will score a low probability of turn end and wait longer
// for the user to continue speaking. This can be useful for more natural
// conversations, but may have a higher latency.
type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct {
// Optional idle timeout after which turn detection will auto-timeout when no
// additional audio is received.
// Only one field can be non-zero.
//
// Use [param.IsOmitted] to confirm if a field is set.
type RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam struct {
	// Variant for `server_vad` turn detection (speech detected by audio volume).
	OfServerVad *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"`
	// Variant for `semantic_vad` turn detection (model-based end-of-turn estimation).
	OfSemanticVad *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"`
	paramUnion
}
// MarshalJSON serializes whichever variant of the union is populated.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) {
	return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad)
}
// UnmarshalJSON decodes into the matching variant, dispatching on the
// "type" discriminator registered for this union in init.
func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, u)
}
// asAny exposes the concrete variant held by the union, or nil when no
// variant has been set.
func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) asAny() any {
	switch {
	case !param.IsOmitted(u.OfServerVad):
		return u.OfServerVad
	case !param.IsOmitted(u.OfSemanticVad):
		return u.OfSemanticVad
	default:
		return nil
	}
}
// GetIdleTimeoutMs returns a pointer to the server VAD variant's
// IdleTimeoutMs, or nil when that variant or field is absent.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 {
	vt := u.OfServerVad
	if vt == nil || !vt.IdleTimeoutMs.Valid() {
		return nil
	}
	return &vt.IdleTimeoutMs.Value
}
// GetPrefixPaddingMs returns a pointer to the server VAD variant's
// PrefixPaddingMs, or nil when that variant or field is absent.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 {
	vt := u.OfServerVad
	if vt == nil || !vt.PrefixPaddingMs.Valid() {
		return nil
	}
	return &vt.PrefixPaddingMs.Value
}
// GetSilenceDurationMs returns a pointer to the server VAD variant's
// SilenceDurationMs, or nil when that variant or field is absent.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 {
	vt := u.OfServerVad
	if vt == nil || !vt.SilenceDurationMs.Valid() {
		return nil
	}
	return &vt.SilenceDurationMs.Value
}
// GetThreshold returns a pointer to the server VAD variant's Threshold,
// or nil when that variant or field is absent.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetThreshold() *float64 {
	vt := u.OfServerVad
	if vt == nil || !vt.Threshold.Valid() {
		return nil
	}
	return &vt.Threshold.Value
}
// GetEagerness returns a pointer to the semantic VAD variant's Eagerness
// field, or nil when that variant is not set.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetEagerness() *string {
	vt := u.OfSemanticVad
	if vt == nil {
		return nil
	}
	return &vt.Eagerness
}
// GetType returns a pointer to the set variant's "type" discriminator
// ("server_vad" or "semantic_vad"), or nil when no variant is set.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetType() *string {
	switch {
	case u.OfServerVad != nil:
		return (*string)(&u.OfServerVad.Type)
	case u.OfSemanticVad != nil:
		return (*string)(&u.OfSemanticVad.Type)
	default:
		return nil
	}
}
// GetCreateResponse returns a pointer to whichever variant's
// CreateResponse option carries a value, or nil when neither does.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool {
	if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() {
		return &vt.CreateResponse.Value
	}
	if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() {
		return &vt.CreateResponse.Value
	}
	return nil
}
// GetInterruptResponse returns a pointer to whichever variant's
// InterruptResponse option carries a value, or nil when neither does.
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool {
	if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() {
		return &vt.InterruptResponse.Value
	}
	if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() {
		return &vt.InterruptResponse.Value
	}
	return nil
}
// Register the union's variants so UnmarshalJSON can dispatch on the
// "type" discriminator field.
func init() {
	apijson.RegisterUnion[RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam](
		"type",
		apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam]("server_vad"),
		apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]("semantic_vad"),
	)
}
// Server-side voice activity detection (VAD) which flips on when user speech is
// detected and off after a period of silence.
//
// The property Type is required.
type RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam struct {
// Optional timeout after which a model response will be triggered automatically.
// This is useful for situations in which a long pause from the user is unexpected,
// such as a phone call. The model will effectively prompt the user to continue the
// conversation based on the current context.
//
// The timeout value will be applied after the last model response's audio has
// finished playing, i.e. it's set to the `response.done` time plus audio playback
// duration.
//
// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
// Response) will be emitted when the timeout is reached. Idle timeout is currently
// only supported for `server_vad` mode.
IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"`
// Whether or not to automatically generate a response when a VAD stop event
// occurs.
@@ -1358,46 +1590,60 @@ type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct {
// defaults to 0.5. A higher threshold will require louder audio to activate the
// model, and thus might perform better in noisy environments.
Threshold param.Opt[float64] `json:"threshold,omitzero"`
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`.
// Type of turn detection, `server_vad` to turn on simple Server VAD.
//
// Any of "low", "medium", "high", "auto".
Eagerness RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"`
// Type of turn detection.
//
// Any of "server_vad", "semantic_vad".
Type RealtimeTranscriptionSessionAudioInputTurnDetectionType `json:"type,omitzero"`
// This field can be elided, and will marshal its zero value as "server_vad".
Type constant.ServerVad `json:"type,required"`
paramObj
}
func (r RealtimeTranscriptionSessionAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) {
type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionParam
func (r RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) {
type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam
return param.MarshalObject(r, (*shadow)(&r))
}
func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error {
func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error {
return apijson.UnmarshalRoot(data, r)
}
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
// will wait longer for the user to continue speaking, `high` will respond more
// quickly. `auto` is the default and is equivalent to `medium`.
type RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness string
// Server-side semantic turn detection which uses a model to determine when the
// user has finished speaking.
//
// The property Type is required.
type RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam struct {
	// Whether or not to automatically generate a response when a VAD stop event
	// occurs.
	CreateResponse param.Opt[bool] `json:"create_response,omitzero"`
	// Whether or not to automatically interrupt any ongoing response with output to
	// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
	// occurs.
	InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"`
	// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
	// will wait longer for the user to continue speaking, `high` will respond more
	// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
	// and `high` have max timeouts of 8s, 4s, and 2s respectively.
	//
	// Any of "low", "medium", "high", "auto". Other values are rejected by the
	// field validator registered in init.
	Eagerness string `json:"eagerness,omitzero"`
	// Type of turn detection, `semantic_vad` to turn on Semantic VAD.
	//
	// This field can be elided, and will marshal its zero value as "semantic_vad".
	Type constant.SemanticVad `json:"type,required"`
	paramObj
}
// Allowed values for RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness.
const (
	RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessLow    RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "low"
	RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessMedium RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "medium"
	RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessHigh   RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "high"
	RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessAuto   RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "auto"
)
// MarshalJSON serializes the param. The local shadow type has the same
// fields but no methods, so encoding it does not re-enter this method.
func (r RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) {
	type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam
	return param.MarshalObject(r, (*shadow)(&r))
}
// UnmarshalJSON populates the param from raw JSON.
func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}
// Type of turn detection.
type RealtimeTranscriptionSessionAudioInputTurnDetectionType string

// Allowed values for RealtimeTranscriptionSessionAudioInputTurnDetectionType.
const (
	RealtimeTranscriptionSessionAudioInputTurnDetectionTypeServerVad   RealtimeTranscriptionSessionAudioInputTurnDetectionType = "server_vad"
	RealtimeTranscriptionSessionAudioInputTurnDetectionTypeSemanticVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "semantic_vad"
)
// Restrict the semantic VAD "eagerness" field to its documented values.
func init() {
	apijson.RegisterFieldValidator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam](
		"eagerness", "low", "medium", "high", "auto",
	)
}
// Realtime transcription session object configuration.
//

View File

@@ -879,10 +879,10 @@ type Response struct {
TopLogprobs int64 `json:"top_logprobs,nullable"`
// The truncation strategy to use for the model response.
//
// - `auto`: If the input to this Response exceeds the model's context window size,
//   the model will truncate the response to fit the context window by dropping
//   items from the beginning of the conversation.
// - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
//
// Any of "auto", "disabled".
@@ -1125,10 +1125,10 @@ const (
// The truncation strategy to use for the model response.
//
// - `auto`: If the input to this Response exceeds the model's context window size,
//   the model will truncate the response to fit the context window by dropping
//   items from the beginning of the conversation.
// - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
type ResponseTruncation string
@@ -14285,10 +14285,10 @@ type ResponseNewParams struct {
StreamOptions ResponseNewParamsStreamOptions `json:"stream_options,omitzero"`
// The truncation strategy to use for the model response.
//
// - `auto`: If the input to this Response exceeds the model's context window size,
//   the model will truncate the response to fit the context window by dropping
//   items from the beginning of the conversation.
// - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
//
// Any of "auto", "disabled".
@@ -14548,10 +14548,10 @@ func (u ResponseNewParamsToolChoiceUnion) GetName() *string {
// The truncation strategy to use for the model response.
//
// - `auto`: If the input to this Response exceeds the model's context window size,
//   the model will truncate the response to fit the context window by dropping
//   items from the beginning of the conversation.
// - `disabled` (default): If the input size will exceed the context window size
// for a model, the request will fail with a 400 error.
type ResponseNewParamsTruncation string

View File

@@ -217,6 +217,8 @@ type ScoreModel string // Always "score_mo
type Screenshot string // Always "screenshot"
type Scroll string // Always "scroll"
type Search string // Always "search"
type SemanticVad string // Always "semantic_vad"
type ServerVad string // Always "server_vad"
type SessionCreated string // Always "session.created"
type SessionUpdate string // Always "session.update"
type SessionUpdated string // Always "session.updated"
@@ -610,6 +612,8 @@ func (c ScoreModel) Default() ScoreModel { return "score
func (c Screenshot) Default() Screenshot { return "screenshot" }
func (c Scroll) Default() Scroll { return "scroll" }
func (c Search) Default() Search { return "search" }
func (c SemanticVad) Default() SemanticVad { return "semantic_vad" }
func (c ServerVad) Default() ServerVad { return "server_vad" }
func (c SessionCreated) Default() SessionCreated { return "session.created" }
func (c SessionUpdate) Default() SessionUpdate { return "session.update" }
func (c SessionUpdated) Default() SessionUpdated { return "session.updated" }
@@ -903,6 +907,8 @@ func (c ScoreModel) MarshalJSON() ([]byte, error) { retu
func (c Screenshot) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c Scroll) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c Search) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SemanticVad) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c ServerVad) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SessionCreated) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SessionUpdate) MarshalJSON() ([]byte, error) { return marshalString(c) }
func (c SessionUpdated) MarshalJSON() ([]byte, error) { return marshalString(c) }