mirror of
https://github.com/openai/openai-go.git
synced 2026-03-31 16:47:11 +09:00
chore(api): Minor docs and type updates for realtime
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
configured_endpoints: 106
|
||||
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-16cb18bed32bae8c5840fb39a1bf664026cc40463ad0c487dcb0df1bd3d72db0.yml
|
||||
openapi_spec_hash: 4cb51b22f98dee1a90bc7add82d1d132
|
||||
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c829f9e7f51d4946dae7b02eb37eb857b538a464cf54c7ced5eff1b1c93e07db.yml
|
||||
openapi_spec_hash: 1b2eaba46b264bcec8831bc496543649
|
||||
config_hash: 930dac3aa861344867e4ac84f037b5df
|
||||
|
||||
4
api.md
4
api.md
@@ -793,7 +793,7 @@ Params Types:
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioConfigInputParam">RealtimeAudioConfigInputParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioConfigOutputParam">RealtimeAudioConfigOutputParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioFormatsUnionParam">RealtimeAudioFormatsUnionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioInputTurnDetectionParam">RealtimeAudioInputTurnDetectionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeAudioInputTurnDetectionUnionParam">RealtimeAudioInputTurnDetectionUnionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeFunctionToolParam">RealtimeFunctionToolParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeSessionCreateRequestParam">RealtimeSessionCreateRequestParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeToolChoiceConfigUnionParam">RealtimeToolChoiceConfigUnionParam</a>
|
||||
@@ -802,7 +802,7 @@ Params Types:
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTracingConfigUnionParam">RealtimeTracingConfigUnionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioParam">RealtimeTranscriptionSessionAudioParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioInputParam">RealtimeTranscriptionSessionAudioInputParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioInputTurnDetectionParam">RealtimeTranscriptionSessionAudioInputTurnDetectionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam">RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTranscriptionSessionCreateRequestParam">RealtimeTranscriptionSessionCreateRequestParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTruncationUnionParam">RealtimeTruncationUnionParam</a>
|
||||
- <a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime">realtime</a>.<a href="https://pkg.go.dev/github.com/openai/openai-go/v2/realtime#RealtimeTruncationRetentionRatioParam">RealtimeTruncationRetentionRatioParam</a>
|
||||
|
||||
@@ -8,7 +8,6 @@ import (
|
||||
"net/http"
|
||||
|
||||
"github.com/openai/openai-go/v2/internal/apijson"
|
||||
"github.com/openai/openai-go/v2/internal/paramutil"
|
||||
"github.com/openai/openai-go/v2/internal/requestconfig"
|
||||
"github.com/openai/openai-go/v2/option"
|
||||
"github.com/openai/openai-go/v2/packages/param"
|
||||
@@ -192,15 +191,18 @@ type RealtimeSessionCreateResponseAudioInput struct {
|
||||
Transcription AudioTranscription `json:"transcription"`
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response. Server VAD means that the model will detect the start and end of
|
||||
// speech based on audio volume and respond at the end of user speech. Semantic VAD
|
||||
// is more advanced and uses a turn detection model (in conjunction with VAD) to
|
||||
// semantically estimate whether the user has finished speaking, then dynamically
|
||||
// sets a timeout based on this probability. For example, if user audio trails off
|
||||
// with "uhhm", the model will score a low probability of turn end and wait longer
|
||||
// for the user to continue speaking. This can be useful for more natural
|
||||
// conversations, but may have a higher latency.
|
||||
TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetection `json:"turn_detection"`
|
||||
// response.
|
||||
//
|
||||
// Server VAD means that the model will detect the start and end of speech based on
|
||||
// audio volume and respond at the end of user speech.
|
||||
//
|
||||
// Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
||||
// with VAD) to semantically estimate whether the user has finished speaking, then
|
||||
// dynamically sets a timeout based on this probability. For example, if user audio
|
||||
// trails off with "uhhm", the model will score a low probability of turn end and
|
||||
// wait longer for the user to continue speaking. This can be useful for more
|
||||
// natural conversations, but may have a higher latency.
|
||||
TurnDetection RealtimeSessionCreateResponseAudioInputTurnDetectionUnion `json:"turn_detection,nullable"`
|
||||
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
||||
JSON struct {
|
||||
Format respjson.Field
|
||||
@@ -244,29 +246,118 @@ func (r *RealtimeSessionCreateResponseAudioInputNoiseReduction) UnmarshalJSON(da
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response. Server VAD means that the model will detect the start and end of
|
||||
// speech based on audio volume and respond at the end of user speech. Semantic VAD
|
||||
// is more advanced and uses a turn detection model (in conjunction with VAD) to
|
||||
// semantically estimate whether the user has finished speaking, then dynamically
|
||||
// sets a timeout based on this probability. For example, if user audio trails off
|
||||
// with "uhhm", the model will score a low probability of turn end and wait longer
|
||||
// for the user to continue speaking. This can be useful for more natural
|
||||
// conversations, but may have a higher latency.
|
||||
type RealtimeSessionCreateResponseAudioInputTurnDetection struct {
|
||||
// RealtimeSessionCreateResponseAudioInputTurnDetectionUnion contains all possible
|
||||
// properties and values from
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad],
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad].
|
||||
//
|
||||
// Use the [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny] method
|
||||
// to switch on the variant.
|
||||
//
|
||||
// Use the methods beginning with 'As' to cast the union to one of its variants.
|
||||
type RealtimeSessionCreateResponseAudioInputTurnDetectionUnion struct {
|
||||
// Any of "server_vad", "semantic_vad".
|
||||
Type string `json:"type"`
|
||||
CreateResponse bool `json:"create_response"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
|
||||
IdleTimeoutMs int64 `json:"idle_timeout_ms"`
|
||||
InterruptResponse bool `json:"interrupt_response"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
|
||||
PrefixPaddingMs int64 `json:"prefix_padding_ms"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
|
||||
SilenceDurationMs int64 `json:"silence_duration_ms"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad].
|
||||
Threshold float64 `json:"threshold"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad].
|
||||
Eagerness string `json:"eagerness"`
|
||||
JSON struct {
|
||||
Type respjson.Field
|
||||
CreateResponse respjson.Field
|
||||
IdleTimeoutMs respjson.Field
|
||||
InterruptResponse respjson.Field
|
||||
PrefixPaddingMs respjson.Field
|
||||
SilenceDurationMs respjson.Field
|
||||
Threshold respjson.Field
|
||||
Eagerness respjson.Field
|
||||
raw string
|
||||
} `json:"-"`
|
||||
}
|
||||
|
||||
// anyRealtimeSessionCreateResponseAudioInputTurnDetection is implemented by each
|
||||
// variant of [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion] to add
|
||||
// type safety for the return type of
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny]
|
||||
type anyRealtimeSessionCreateResponseAudioInputTurnDetection interface {
|
||||
implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion()
|
||||
}
|
||||
|
||||
func (RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() {
|
||||
}
|
||||
func (RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) implRealtimeSessionCreateResponseAudioInputTurnDetectionUnion() {
|
||||
}
|
||||
|
||||
// Use the following switch statement to find the correct variant
|
||||
//
|
||||
// switch variant := RealtimeSessionCreateResponseAudioInputTurnDetectionUnion.AsAny().(type) {
|
||||
// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad:
|
||||
// case realtime.RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad:
|
||||
// default:
|
||||
// fmt.Errorf("no variant present")
|
||||
// }
|
||||
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsAny() anyRealtimeSessionCreateResponseAudioInputTurnDetection {
|
||||
switch u.Type {
|
||||
case "server_vad":
|
||||
return u.AsServerVad()
|
||||
case "semantic_vad":
|
||||
return u.AsSemanticVad()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsServerVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) {
|
||||
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
||||
return
|
||||
}
|
||||
|
||||
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) AsSemanticVad() (v RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) {
|
||||
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
||||
return
|
||||
}
|
||||
|
||||
// Returns the unmodified JSON received from the API
|
||||
func (u RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) RawJSON() string {
|
||||
return u.JSON.raw
|
||||
}
|
||||
|
||||
func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionUnion) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Server-side voice activity detection (VAD) which flips on when user speech is
|
||||
// detected and off after a period of silence.
|
||||
type RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad struct {
|
||||
// Type of turn detection, `server_vad` to turn on simple Server VAD.
|
||||
Type constant.ServerVad `json:"type,required"`
|
||||
// Whether or not to automatically generate a response when a VAD stop event
|
||||
// occurs.
|
||||
CreateResponse bool `json:"create_response"`
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
||||
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
||||
// Optional timeout after which a model response will be triggered automatically.
|
||||
// This is useful for situations in which a long pause from the user is unexpected,
|
||||
// such as a phone call. The model will effectively prompt the user to continue the
|
||||
// conversation based on the current context.
|
||||
//
|
||||
// Any of "low", "medium", "high", "auto".
|
||||
Eagerness string `json:"eagerness"`
|
||||
// Optional idle timeout after which turn detection will auto-timeout when no
|
||||
// additional audio is received and emits a `timeout_triggered` event.
|
||||
// The timeout value will be applied after the last model response's audio has
|
||||
// finished playing, i.e. it's set to the `response.done` time plus audio playback
|
||||
// duration.
|
||||
//
|
||||
// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
||||
// Response) will be emitted when the timeout is reached. Idle timeout is currently
|
||||
// only supported for `server_vad` mode.
|
||||
IdleTimeoutMs int64 `json:"idle_timeout_ms,nullable"`
|
||||
// Whether or not to automatically interrupt any ongoing response with output to
|
||||
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
||||
@@ -283,28 +374,63 @@ type RealtimeSessionCreateResponseAudioInputTurnDetection struct {
|
||||
// defaults to 0.5. A higher threshold will require louder audio to activate the
|
||||
// model, and thus might perform better in noisy environments.
|
||||
Threshold float64 `json:"threshold"`
|
||||
// Type of turn detection.
|
||||
//
|
||||
// Any of "server_vad", "semantic_vad".
|
||||
Type string `json:"type"`
|
||||
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
||||
JSON struct {
|
||||
Type respjson.Field
|
||||
CreateResponse respjson.Field
|
||||
Eagerness respjson.Field
|
||||
IdleTimeoutMs respjson.Field
|
||||
InterruptResponse respjson.Field
|
||||
PrefixPaddingMs respjson.Field
|
||||
SilenceDurationMs respjson.Field
|
||||
Threshold respjson.Field
|
||||
Type respjson.Field
|
||||
ExtraFields map[string]respjson.Field
|
||||
raw string
|
||||
} `json:"-"`
|
||||
}
|
||||
|
||||
// Returns the unmodified JSON received from the API
|
||||
func (r RealtimeSessionCreateResponseAudioInputTurnDetection) RawJSON() string { return r.JSON.raw }
|
||||
func (r *RealtimeSessionCreateResponseAudioInputTurnDetection) UnmarshalJSON(data []byte) error {
|
||||
func (r RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) RawJSON() string {
|
||||
return r.JSON.raw
|
||||
}
|
||||
func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionServerVad) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Server-side semantic turn detection which uses a model to determine when the
|
||||
// user has finished speaking.
|
||||
type RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad struct {
|
||||
// Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
||||
Type constant.SemanticVad `json:"type,required"`
|
||||
// Whether or not to automatically generate a response when a VAD stop event
|
||||
// occurs.
|
||||
CreateResponse bool `json:"create_response"`
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
||||
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
||||
//
|
||||
// Any of "low", "medium", "high", "auto".
|
||||
Eagerness string `json:"eagerness"`
|
||||
// Whether or not to automatically interrupt any ongoing response with output to
|
||||
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
||||
// occurs.
|
||||
InterruptResponse bool `json:"interrupt_response"`
|
||||
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
||||
JSON struct {
|
||||
Type respjson.Field
|
||||
CreateResponse respjson.Field
|
||||
Eagerness respjson.Field
|
||||
InterruptResponse respjson.Field
|
||||
ExtraFields map[string]respjson.Field
|
||||
raw string
|
||||
} `json:"-"`
|
||||
}
|
||||
|
||||
// Returns the unmodified JSON received from the API
|
||||
func (r RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) RawJSON() string {
|
||||
return r.JSON.raw
|
||||
}
|
||||
func (r *RealtimeSessionCreateResponseAudioInputTurnDetectionSemanticVad) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
@@ -1152,7 +1278,8 @@ type ClientSecretNewResponseSessionUnionAudioInput struct {
|
||||
NoiseReduction ClientSecretNewResponseSessionUnionAudioInputNoiseReduction `json:"noise_reduction"`
|
||||
// This field is from variant [RealtimeSessionCreateResponseAudioInput].
|
||||
Transcription AudioTranscription `json:"transcription"`
|
||||
// This field is a union of [RealtimeSessionCreateResponseAudioInputTurnDetection],
|
||||
// This field is a union of
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion],
|
||||
// [RealtimeTranscriptionSessionTurnDetection]
|
||||
TurnDetection ClientSecretNewResponseSessionUnionAudioInputTurnDetection `json:"turn_detection"`
|
||||
JSON struct {
|
||||
@@ -1197,31 +1324,27 @@ func (r *ClientSecretNewResponseSessionUnionAudioInputNoiseReduction) UnmarshalJ
|
||||
// For type safety it is recommended to directly use a variant of the
|
||||
// [ClientSecretNewResponseSessionUnion].
|
||||
type ClientSecretNewResponseSessionUnionAudioInputTurnDetection struct {
|
||||
Type string `json:"type"`
|
||||
CreateResponse bool `json:"create_response"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
|
||||
CreateResponse bool `json:"create_response"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
|
||||
Eagerness string `json:"eagerness"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
|
||||
IdleTimeoutMs int64 `json:"idle_timeout_ms"`
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetection].
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion].
|
||||
IdleTimeoutMs int64 `json:"idle_timeout_ms"`
|
||||
InterruptResponse bool `json:"interrupt_response"`
|
||||
PrefixPaddingMs int64 `json:"prefix_padding_ms"`
|
||||
SilenceDurationMs int64 `json:"silence_duration_ms"`
|
||||
Threshold float64 `json:"threshold"`
|
||||
Type string `json:"type"`
|
||||
JSON struct {
|
||||
// This field is from variant
|
||||
// [RealtimeSessionCreateResponseAudioInputTurnDetectionUnion].
|
||||
Eagerness string `json:"eagerness"`
|
||||
JSON struct {
|
||||
Type respjson.Field
|
||||
CreateResponse respjson.Field
|
||||
Eagerness respjson.Field
|
||||
IdleTimeoutMs respjson.Field
|
||||
InterruptResponse respjson.Field
|
||||
PrefixPaddingMs respjson.Field
|
||||
SilenceDurationMs respjson.Field
|
||||
Threshold respjson.Field
|
||||
Type respjson.Field
|
||||
Eagerness respjson.Field
|
||||
raw string
|
||||
} `json:"-"`
|
||||
}
|
||||
@@ -1518,45 +1641,49 @@ func (u clientSecretNewParamsSessionUnionAudioInput) GetTranscription() *AudioTr
|
||||
func (u clientSecretNewParamsSessionUnionAudioInput) GetTurnDetection() (res clientSecretNewParamsSessionUnionAudioInputTurnDetection) {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioConfigInputParam:
|
||||
res.any = &vt.TurnDetection
|
||||
res.any = vt.TurnDetection
|
||||
case *RealtimeTranscriptionSessionAudioInputParam:
|
||||
res.any = &vt.TurnDetection
|
||||
res.any = vt.TurnDetection
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// Can have the runtime types [*RealtimeAudioInputTurnDetectionParam],
|
||||
// [*RealtimeTranscriptionSessionAudioInputTurnDetectionParam]
|
||||
// Can have the runtime types [*RealtimeAudioInputTurnDetectionServerVadParam],
|
||||
// [*RealtimeAudioInputTurnDetectionSemanticVadParam],
|
||||
// [*RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam],
|
||||
// [*RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]
|
||||
type clientSecretNewParamsSessionUnionAudioInputTurnDetection struct{ any }
|
||||
|
||||
// Use the following switch statement to get the type of the union:
|
||||
//
|
||||
// switch u.AsAny().(type) {
|
||||
// case *realtime.RealtimeAudioInputTurnDetectionParam:
|
||||
// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
// case *realtime.RealtimeAudioInputTurnDetectionServerVadParam:
|
||||
// case *realtime.RealtimeAudioInputTurnDetectionSemanticVadParam:
|
||||
// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam:
|
||||
// case *realtime.RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam:
|
||||
// default:
|
||||
// fmt.Errorf("not present")
|
||||
// }
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) AsAny() any { return u.any }
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool {
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.CreateResponse)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.CreateResponse)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetType()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetType()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string {
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetCreateResponse() *bool {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return (*string)(&vt.Eagerness)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return (*string)(&vt.Eagerness)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetCreateResponse()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetCreateResponse()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1564,10 +1691,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness()
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeoutMs() *int64 {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.IdleTimeoutMs)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.IdleTimeoutMs)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetIdleTimeoutMs()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetIdleTimeoutMs()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1575,10 +1702,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetIdleTimeout
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptResponse() *bool {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.InterruptResponse)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.InterruptResponse)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetInterruptResponse()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetInterruptResponse()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1586,10 +1713,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetInterruptRe
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddingMs() *int64 {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.PrefixPaddingMs)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.PrefixPaddingMs)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetPrefixPaddingMs()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetPrefixPaddingMs()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1597,10 +1724,10 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetPrefixPaddi
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDurationMs() *int64 {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.SilenceDurationMs)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.SilenceDurationMs)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetSilenceDurationMs()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetSilenceDurationMs()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -1608,21 +1735,21 @@ func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetSilenceDura
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetThreshold() *float64 {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.Threshold)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return paramutil.AddrIfPresent(vt.Threshold)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetThreshold()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetThreshold()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetType() *string {
|
||||
func (u clientSecretNewParamsSessionUnionAudioInputTurnDetection) GetEagerness() *string {
|
||||
switch vt := u.any.(type) {
|
||||
case *RealtimeAudioInputTurnDetectionParam:
|
||||
return (*string)(&vt.Type)
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionParam:
|
||||
return (*string)(&vt.Type)
|
||||
case *RealtimeAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetEagerness()
|
||||
case *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam:
|
||||
return vt.GetEagerness()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -51,15 +51,15 @@ func TestClientSecretNewWithOptionalParams(t *testing.T) {
|
||||
Model: realtime.AudioTranscriptionModelWhisper1,
|
||||
Prompt: openai.String("prompt"),
|
||||
},
|
||||
TurnDetection: realtime.RealtimeAudioInputTurnDetectionParam{
|
||||
CreateResponse: openai.Bool(true),
|
||||
Eagerness: realtime.RealtimeAudioInputTurnDetectionEagernessLow,
|
||||
IdleTimeoutMs: openai.Int(0),
|
||||
InterruptResponse: openai.Bool(true),
|
||||
PrefixPaddingMs: openai.Int(0),
|
||||
SilenceDurationMs: openai.Int(0),
|
||||
Threshold: openai.Float(0),
|
||||
Type: realtime.RealtimeAudioInputTurnDetectionTypeServerVad,
|
||||
TurnDetection: realtime.RealtimeAudioInputTurnDetectionUnionParam{
|
||||
OfServerVad: &realtime.RealtimeAudioInputTurnDetectionServerVadParam{
|
||||
CreateResponse: openai.Bool(true),
|
||||
IdleTimeoutMs: openai.Int(5000),
|
||||
InterruptResponse: openai.Bool(true),
|
||||
PrefixPaddingMs: openai.Int(0),
|
||||
SilenceDurationMs: openai.Int(0),
|
||||
Threshold: openai.Float(0),
|
||||
},
|
||||
},
|
||||
},
|
||||
Output: realtime.RealtimeAudioConfigOutputParam{
|
||||
|
||||
@@ -141,6 +141,20 @@ func (r *RealtimeAudioConfigParam) UnmarshalJSON(data []byte) error {
|
||||
}
|
||||
|
||||
type RealtimeAudioConfigInputParam struct {
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response.
|
||||
//
|
||||
// Server VAD means that the model will detect the start and end of speech based on
|
||||
// audio volume and respond at the end of user speech.
|
||||
//
|
||||
// Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
||||
// with VAD) to semantically estimate whether the user has finished speaking, then
|
||||
// dynamically sets a timeout based on this probability. For example, if user audio
|
||||
// trails off with "uhhm", the model will score a low probability of turn end and
|
||||
// wait longer for the user to continue speaking. This can be useful for more
|
||||
// natural conversations, but may have a higher latency.
|
||||
TurnDetection RealtimeAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"`
|
||||
// The format of the input audio.
|
||||
Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"`
|
||||
// Configuration for input audio noise reduction. This can be set to `null` to turn
|
||||
@@ -158,17 +172,6 @@ type RealtimeAudioConfigInputParam struct {
|
||||
// what the model heard. The client can optionally set the language and prompt for
|
||||
// transcription, these offer additional guidance to the transcription service.
|
||||
Transcription AudioTranscriptionParam `json:"transcription,omitzero"`
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response. Server VAD means that the model will detect the start and end of
|
||||
// speech based on audio volume and respond at the end of user speech. Semantic VAD
|
||||
// is more advanced and uses a turn detection model (in conjunction with VAD) to
|
||||
// semantically estimate whether the user has finished speaking, then dynamically
|
||||
// sets a timeout based on this probability. For example, if user audio trails off
|
||||
// with "uhhm", the model will score a low probability of turn end and wait longer
|
||||
// for the user to continue speaking. This can be useful for more natural
|
||||
// conversations, but may have a higher latency.
|
||||
TurnDetection RealtimeAudioInputTurnDetectionParam `json:"turn_detection,omitzero"`
|
||||
paramObj
|
||||
}
|
||||
|
||||
@@ -530,19 +533,126 @@ func init() {
|
||||
)
|
||||
}
|
||||
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response. Server VAD means that the model will detect the start and end of
|
||||
// speech based on audio volume and respond at the end of user speech. Semantic VAD
|
||||
// is more advanced and uses a turn detection model (in conjunction with VAD) to
|
||||
// semantically estimate whether the user has finished speaking, then dynamically
|
||||
// sets a timeout based on this probability. For example, if user audio trails off
|
||||
// with "uhhm", the model will score a low probability of turn end and wait longer
|
||||
// for the user to continue speaking. This can be useful for more natural
|
||||
// conversations, but may have a higher latency.
|
||||
type RealtimeAudioInputTurnDetectionParam struct {
|
||||
// Optional idle timeout after which turn detection will auto-timeout when no
|
||||
// additional audio is received and emits a `timeout_triggered` event.
|
||||
// Only one field can be non-zero.
|
||||
//
|
||||
// Use [param.IsOmitted] to confirm if a field is set.
|
||||
type RealtimeAudioInputTurnDetectionUnionParam struct {
|
||||
OfServerVad *RealtimeAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"`
|
||||
OfSemanticVad *RealtimeAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"`
|
||||
paramUnion
|
||||
}
|
||||
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) {
|
||||
return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad)
|
||||
}
|
||||
func (u *RealtimeAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, u)
|
||||
}
|
||||
|
||||
func (u *RealtimeAudioInputTurnDetectionUnionParam) asAny() any {
|
||||
if !param.IsOmitted(u.OfServerVad) {
|
||||
return u.OfServerVad
|
||||
} else if !param.IsOmitted(u.OfSemanticVad) {
|
||||
return u.OfSemanticVad
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() {
|
||||
return &vt.IdleTimeoutMs.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() {
|
||||
return &vt.PrefixPaddingMs.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() {
|
||||
return &vt.SilenceDurationMs.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetThreshold() *float64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() {
|
||||
return &vt.Threshold.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetEagerness() *string {
|
||||
if vt := u.OfSemanticVad; vt != nil {
|
||||
return &vt.Eagerness
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetType() *string {
|
||||
if vt := u.OfServerVad; vt != nil {
|
||||
return (*string)(&vt.Type)
|
||||
} else if vt := u.OfSemanticVad; vt != nil {
|
||||
return (*string)(&vt.Type)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool {
|
||||
if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() {
|
||||
return &vt.CreateResponse.Value
|
||||
} else if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() {
|
||||
return &vt.CreateResponse.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool {
|
||||
if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() {
|
||||
return &vt.InterruptResponse.Value
|
||||
} else if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() {
|
||||
return &vt.InterruptResponse.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
apijson.RegisterUnion[RealtimeAudioInputTurnDetectionUnionParam](
|
||||
"type",
|
||||
apijson.Discriminator[RealtimeAudioInputTurnDetectionServerVadParam]("server_vad"),
|
||||
apijson.Discriminator[RealtimeAudioInputTurnDetectionSemanticVadParam]("semantic_vad"),
|
||||
)
|
||||
}
|
||||
|
||||
// Server-side voice activity detection (VAD) which flips on when user speech is
|
||||
// detected and off after a period of silence.
|
||||
//
|
||||
// The property Type is required.
|
||||
type RealtimeAudioInputTurnDetectionServerVadParam struct {
|
||||
// Optional timeout after which a model response will be triggered automatically.
|
||||
// This is useful for situations in which a long pause from the user is unexpected,
|
||||
// such as a phone call. The model will effectively prompt the user to continue the
|
||||
// conversation based on the current context.
|
||||
//
|
||||
// The timeout value will be applied after the last model response's audio has
|
||||
// finished playing, i.e. it's set to the `response.done` time plus audio playback
|
||||
// duration.
|
||||
//
|
||||
// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
||||
// Response) will be emitted when the timeout is reached. Idle timeout is currently
|
||||
// only supported for `server_vad` mode.
|
||||
IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"`
|
||||
// Whether or not to automatically generate a response when a VAD stop event
|
||||
// occurs.
|
||||
@@ -562,48 +672,60 @@ type RealtimeAudioInputTurnDetectionParam struct {
|
||||
// defaults to 0.5. A higher threshold will require louder audio to activate the
|
||||
// model, and thus might perform better in noisy environments.
|
||||
Threshold param.Opt[float64] `json:"threshold,omitzero"`
|
||||
// Type of turn detection, `server_vad` to turn on simple Server VAD.
|
||||
//
|
||||
// This field can be elided, and will marshal its zero value as "server_vad".
|
||||
Type constant.ServerVad `json:"type,required"`
|
||||
paramObj
|
||||
}
|
||||
|
||||
func (r RealtimeAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) {
|
||||
type shadow RealtimeAudioInputTurnDetectionServerVadParam
|
||||
return param.MarshalObject(r, (*shadow)(&r))
|
||||
}
|
||||
func (r *RealtimeAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Server-side semantic turn detection which uses a model to determine when the
|
||||
// user has finished speaking.
|
||||
//
|
||||
// The property Type is required.
|
||||
type RealtimeAudioInputTurnDetectionSemanticVadParam struct {
|
||||
// Whether or not to automatically generate a response when a VAD stop event
|
||||
// occurs.
|
||||
CreateResponse param.Opt[bool] `json:"create_response,omitzero"`
|
||||
// Whether or not to automatically interrupt any ongoing response with output to
|
||||
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
||||
// occurs.
|
||||
InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"`
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
||||
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
||||
//
|
||||
// Any of "low", "medium", "high", "auto".
|
||||
Eagerness RealtimeAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"`
|
||||
// Type of turn detection.
|
||||
Eagerness string `json:"eagerness,omitzero"`
|
||||
// Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
||||
//
|
||||
// Any of "server_vad", "semantic_vad".
|
||||
Type RealtimeAudioInputTurnDetectionType `json:"type,omitzero"`
|
||||
// This field can be elided, and will marshal its zero value as "semantic_vad".
|
||||
Type constant.SemanticVad `json:"type,required"`
|
||||
paramObj
|
||||
}
|
||||
|
||||
func (r RealtimeAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) {
|
||||
type shadow RealtimeAudioInputTurnDetectionParam
|
||||
func (r RealtimeAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) {
|
||||
type shadow RealtimeAudioInputTurnDetectionSemanticVadParam
|
||||
return param.MarshalObject(r, (*shadow)(&r))
|
||||
}
|
||||
func (r *RealtimeAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error {
|
||||
func (r *RealtimeAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
||||
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
||||
type RealtimeAudioInputTurnDetectionEagerness string
|
||||
|
||||
const (
|
||||
RealtimeAudioInputTurnDetectionEagernessLow RealtimeAudioInputTurnDetectionEagerness = "low"
|
||||
RealtimeAudioInputTurnDetectionEagernessMedium RealtimeAudioInputTurnDetectionEagerness = "medium"
|
||||
RealtimeAudioInputTurnDetectionEagernessHigh RealtimeAudioInputTurnDetectionEagerness = "high"
|
||||
RealtimeAudioInputTurnDetectionEagernessAuto RealtimeAudioInputTurnDetectionEagerness = "auto"
|
||||
)
|
||||
|
||||
// Type of turn detection.
|
||||
type RealtimeAudioInputTurnDetectionType string
|
||||
|
||||
const (
|
||||
RealtimeAudioInputTurnDetectionTypeServerVad RealtimeAudioInputTurnDetectionType = "server_vad"
|
||||
RealtimeAudioInputTurnDetectionTypeSemanticVad RealtimeAudioInputTurnDetectionType = "semantic_vad"
|
||||
)
|
||||
func init() {
|
||||
apijson.RegisterFieldValidator[RealtimeAudioInputTurnDetectionSemanticVadParam](
|
||||
"eagerness", "low", "medium", "high", "auto",
|
||||
)
|
||||
}
|
||||
|
||||
type RealtimeFunctionTool struct {
|
||||
// The description of the function, including guidance on when and how to call it,
|
||||
@@ -1264,6 +1386,20 @@ func (r *RealtimeTranscriptionSessionAudioParam) UnmarshalJSON(data []byte) erro
|
||||
}
|
||||
|
||||
type RealtimeTranscriptionSessionAudioInputParam struct {
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response.
|
||||
//
|
||||
// Server VAD means that the model will detect the start and end of speech based on
|
||||
// audio volume and respond at the end of user speech.
|
||||
//
|
||||
// Semantic VAD is more advanced and uses a turn detection model (in conjunction
|
||||
// with VAD) to semantically estimate whether the user has finished speaking, then
|
||||
// dynamically sets a timeout based on this probability. For example, if user audio
|
||||
// trails off with "uhhm", the model will score a low probability of turn end and
|
||||
// wait longer for the user to continue speaking. This can be useful for more
|
||||
// natural conversations, but may have a higher latency.
|
||||
TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam `json:"turn_detection,omitzero"`
|
||||
// The PCM audio format. Only a 24kHz sample rate is supported.
|
||||
Format RealtimeAudioFormatsUnionParam `json:"format,omitzero"`
|
||||
// Configuration for input audio noise reduction. This can be set to `null` to turn
|
||||
@@ -1281,17 +1417,6 @@ type RealtimeTranscriptionSessionAudioInputParam struct {
|
||||
// what the model heard. The client can optionally set the language and prompt for
|
||||
// transcription, these offer additional guidance to the transcription service.
|
||||
Transcription AudioTranscriptionParam `json:"transcription,omitzero"`
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response. Server VAD means that the model will detect the start and end of
|
||||
// speech based on audio volume and respond at the end of user speech. Semantic VAD
|
||||
// is more advanced and uses a turn detection model (in conjunction with VAD) to
|
||||
// semantically estimate whether the user has finished speaking, then dynamically
|
||||
// sets a timeout based on this probability. For example, if user audio trails off
|
||||
// with "uhhm", the model will score a low probability of turn end and wait longer
|
||||
// for the user to continue speaking. This can be useful for more natural
|
||||
// conversations, but may have a higher latency.
|
||||
TurnDetection RealtimeTranscriptionSessionAudioInputTurnDetectionParam `json:"turn_detection,omitzero"`
|
||||
paramObj
|
||||
}
|
||||
|
||||
@@ -1326,19 +1451,126 @@ func (r *RealtimeTranscriptionSessionAudioInputNoiseReductionParam) UnmarshalJSO
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Configuration for turn detection, either Server VAD or Semantic VAD. This can be
|
||||
// set to `null` to turn off, in which case the client must manually trigger model
|
||||
// response. Server VAD means that the model will detect the start and end of
|
||||
// speech based on audio volume and respond at the end of user speech. Semantic VAD
|
||||
// is more advanced and uses a turn detection model (in conjunction with VAD) to
|
||||
// semantically estimate whether the user has finished speaking, then dynamically
|
||||
// sets a timeout based on this probability. For example, if user audio trails off
|
||||
// with "uhhm", the model will score a low probability of turn end and wait longer
|
||||
// for the user to continue speaking. This can be useful for more natural
|
||||
// conversations, but may have a higher latency.
|
||||
type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct {
|
||||
// Optional idle timeout after which turn detection will auto-timeout when no
|
||||
// additional audio is received.
|
||||
// Only one field can be non-zero.
|
||||
//
|
||||
// Use [param.IsOmitted] to confirm if a field is set.
|
||||
type RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam struct {
|
||||
OfServerVad *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam `json:",omitzero,inline"`
|
||||
OfSemanticVad *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam `json:",omitzero,inline"`
|
||||
paramUnion
|
||||
}
|
||||
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) MarshalJSON() ([]byte, error) {
|
||||
return param.MarshalUnion(u, u.OfServerVad, u.OfSemanticVad)
|
||||
}
|
||||
func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, u)
|
||||
}
|
||||
|
||||
func (u *RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) asAny() any {
|
||||
if !param.IsOmitted(u.OfServerVad) {
|
||||
return u.OfServerVad
|
||||
} else if !param.IsOmitted(u.OfSemanticVad) {
|
||||
return u.OfSemanticVad
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetIdleTimeoutMs() *int64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.IdleTimeoutMs.Valid() {
|
||||
return &vt.IdleTimeoutMs.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetPrefixPaddingMs() *int64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.PrefixPaddingMs.Valid() {
|
||||
return &vt.PrefixPaddingMs.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetSilenceDurationMs() *int64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.SilenceDurationMs.Valid() {
|
||||
return &vt.SilenceDurationMs.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetThreshold() *float64 {
|
||||
if vt := u.OfServerVad; vt != nil && vt.Threshold.Valid() {
|
||||
return &vt.Threshold.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetEagerness() *string {
|
||||
if vt := u.OfSemanticVad; vt != nil {
|
||||
return &vt.Eagerness
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetType() *string {
|
||||
if vt := u.OfServerVad; vt != nil {
|
||||
return (*string)(&vt.Type)
|
||||
} else if vt := u.OfSemanticVad; vt != nil {
|
||||
return (*string)(&vt.Type)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetCreateResponse() *bool {
|
||||
if vt := u.OfServerVad; vt != nil && vt.CreateResponse.Valid() {
|
||||
return &vt.CreateResponse.Value
|
||||
} else if vt := u.OfSemanticVad; vt != nil && vt.CreateResponse.Valid() {
|
||||
return &vt.CreateResponse.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Returns a pointer to the underlying variant's property, if present.
|
||||
func (u RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam) GetInterruptResponse() *bool {
|
||||
if vt := u.OfServerVad; vt != nil && vt.InterruptResponse.Valid() {
|
||||
return &vt.InterruptResponse.Value
|
||||
} else if vt := u.OfSemanticVad; vt != nil && vt.InterruptResponse.Valid() {
|
||||
return &vt.InterruptResponse.Value
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
apijson.RegisterUnion[RealtimeTranscriptionSessionAudioInputTurnDetectionUnionParam](
|
||||
"type",
|
||||
apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam]("server_vad"),
|
||||
apijson.Discriminator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam]("semantic_vad"),
|
||||
)
|
||||
}
|
||||
|
||||
// Server-side voice activity detection (VAD) which flips on when user speech is
|
||||
// detected and off after a period of silence.
|
||||
//
|
||||
// The property Type is required.
|
||||
type RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam struct {
|
||||
// Optional timeout after which a model response will be triggered automatically.
|
||||
// This is useful for situations in which a long pause from the user is unexpected,
|
||||
// such as a phone call. The model will effectively prompt the user to continue the
|
||||
// conversation based on the current context.
|
||||
//
|
||||
// The timeout value will be applied after the last model response's audio has
|
||||
// finished playing, i.e. it's set to the `response.done` time plus audio playback
|
||||
// duration.
|
||||
//
|
||||
// An `input_audio_buffer.timeout_triggered` event (plus events associated with the
|
||||
// Response) will be emitted when the timeout is reached. Idle timeout is currently
|
||||
// only supported for `server_vad` mode.
|
||||
IdleTimeoutMs param.Opt[int64] `json:"idle_timeout_ms,omitzero"`
|
||||
// Whether or not to automatically generate a response when a VAD stop event
|
||||
// occurs.
|
||||
@@ -1358,46 +1590,60 @@ type RealtimeTranscriptionSessionAudioInputTurnDetectionParam struct {
|
||||
// defaults to 0.5. A higher threshold will require louder audio to activate the
|
||||
// model, and thus might perform better in noisy environments.
|
||||
Threshold param.Opt[float64] `json:"threshold,omitzero"`
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`.
|
||||
// Type of turn detection, `server_vad` to turn on simple Server VAD.
|
||||
//
|
||||
// Any of "low", "medium", "high", "auto".
|
||||
Eagerness RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness `json:"eagerness,omitzero"`
|
||||
// Type of turn detection.
|
||||
//
|
||||
// Any of "server_vad", "semantic_vad".
|
||||
Type RealtimeTranscriptionSessionAudioInputTurnDetectionType `json:"type,omitzero"`
|
||||
// This field can be elided, and will marshal its zero value as "server_vad".
|
||||
Type constant.ServerVad `json:"type,required"`
|
||||
paramObj
|
||||
}
|
||||
|
||||
func (r RealtimeTranscriptionSessionAudioInputTurnDetectionParam) MarshalJSON() (data []byte, err error) {
|
||||
type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionParam
|
||||
func (r RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) MarshalJSON() (data []byte, err error) {
|
||||
type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam
|
||||
return param.MarshalObject(r, (*shadow)(&r))
|
||||
}
|
||||
func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionParam) UnmarshalJSON(data []byte) error {
|
||||
func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionServerVadParam) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`.
|
||||
type RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness string
|
||||
// Server-side semantic turn detection which uses a model to determine when the
|
||||
// user has finished speaking.
|
||||
//
|
||||
// The property Type is required.
|
||||
type RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam struct {
|
||||
// Whether or not to automatically generate a response when a VAD stop event
|
||||
// occurs.
|
||||
CreateResponse param.Opt[bool] `json:"create_response,omitzero"`
|
||||
// Whether or not to automatically interrupt any ongoing response with output to
|
||||
// the default conversation (i.e. `conversation` of `auto`) when a VAD start event
|
||||
// occurs.
|
||||
InterruptResponse param.Opt[bool] `json:"interrupt_response,omitzero"`
|
||||
// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
|
||||
// will wait longer for the user to continue speaking, `high` will respond more
|
||||
// quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
|
||||
// and `high` have max timeouts of 8s, 4s, and 2s respectively.
|
||||
//
|
||||
// Any of "low", "medium", "high", "auto".
|
||||
Eagerness string `json:"eagerness,omitzero"`
|
||||
// Type of turn detection, `semantic_vad` to turn on Semantic VAD.
|
||||
//
|
||||
// This field can be elided, and will marshal its zero value as "semantic_vad".
|
||||
Type constant.SemanticVad `json:"type,required"`
|
||||
paramObj
|
||||
}
|
||||
|
||||
const (
|
||||
RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessLow RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "low"
|
||||
RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessMedium RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "medium"
|
||||
RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessHigh RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "high"
|
||||
RealtimeTranscriptionSessionAudioInputTurnDetectionEagernessAuto RealtimeTranscriptionSessionAudioInputTurnDetectionEagerness = "auto"
|
||||
)
|
||||
func (r RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) MarshalJSON() (data []byte, err error) {
|
||||
type shadow RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam
|
||||
return param.MarshalObject(r, (*shadow)(&r))
|
||||
}
|
||||
func (r *RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam) UnmarshalJSON(data []byte) error {
|
||||
return apijson.UnmarshalRoot(data, r)
|
||||
}
|
||||
|
||||
// Type of turn detection.
|
||||
type RealtimeTranscriptionSessionAudioInputTurnDetectionType string
|
||||
|
||||
const (
|
||||
RealtimeTranscriptionSessionAudioInputTurnDetectionTypeServerVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "server_vad"
|
||||
RealtimeTranscriptionSessionAudioInputTurnDetectionTypeSemanticVad RealtimeTranscriptionSessionAudioInputTurnDetectionType = "semantic_vad"
|
||||
)
|
||||
func init() {
|
||||
apijson.RegisterFieldValidator[RealtimeTranscriptionSessionAudioInputTurnDetectionSemanticVadParam](
|
||||
"eagerness", "low", "medium", "high", "auto",
|
||||
)
|
||||
}
|
||||
|
||||
// Realtime transcription session object configuration.
|
||||
//
|
||||
|
||||
@@ -879,10 +879,10 @@ type Response struct {
|
||||
TopLogprobs int64 `json:"top_logprobs,nullable"`
|
||||
// The truncation strategy to use for the model response.
|
||||
//
|
||||
// - `auto`: If the context of this response and previous ones exceeds the model's
|
||||
// context window size, the model will truncate the response to fit the context
|
||||
// window by dropping input items in the middle of the conversation.
|
||||
// - `disabled` (default): If a model response will exceed the context window size
|
||||
// - `auto`: If the input to this Response exceeds the model's context window size,
|
||||
// the model will truncate the response to fit the context window by dropping
|
||||
// items from the beginning of the conversation.
|
||||
// - `disabled` (default): If the input size will exceed the context window size
|
||||
// for a model, the request will fail with a 400 error.
|
||||
//
|
||||
// Any of "auto", "disabled".
|
||||
@@ -1125,10 +1125,10 @@ const (
|
||||
|
||||
// The truncation strategy to use for the model response.
|
||||
//
|
||||
// - `auto`: If the context of this response and previous ones exceeds the model's
|
||||
// context window size, the model will truncate the response to fit the context
|
||||
// window by dropping input items in the middle of the conversation.
|
||||
// - `disabled` (default): If a model response will exceed the context window size
|
||||
// - `auto`: If the input to this Response exceeds the model's context window size,
|
||||
// the model will truncate the response to fit the context window by dropping
|
||||
// items from the beginning of the conversation.
|
||||
// - `disabled` (default): If the input size will exceed the context window size
|
||||
// for a model, the request will fail with a 400 error.
|
||||
type ResponseTruncation string
|
||||
|
||||
@@ -14285,10 +14285,10 @@ type ResponseNewParams struct {
|
||||
StreamOptions ResponseNewParamsStreamOptions `json:"stream_options,omitzero"`
|
||||
// The truncation strategy to use for the model response.
|
||||
//
|
||||
// - `auto`: If the context of this response and previous ones exceeds the model's
|
||||
// context window size, the model will truncate the response to fit the context
|
||||
// window by dropping input items in the middle of the conversation.
|
||||
// - `disabled` (default): If a model response will exceed the context window size
|
||||
// - `auto`: If the input to this Response exceeds the model's context window size,
|
||||
// the model will truncate the response to fit the context window by dropping
|
||||
// items from the beginning of the conversation.
|
||||
// - `disabled` (default): If the input size will exceed the context window size
|
||||
// for a model, the request will fail with a 400 error.
|
||||
//
|
||||
// Any of "auto", "disabled".
|
||||
@@ -14548,10 +14548,10 @@ func (u ResponseNewParamsToolChoiceUnion) GetName() *string {
|
||||
|
||||
// The truncation strategy to use for the model response.
|
||||
//
|
||||
// - `auto`: If the context of this response and previous ones exceeds the model's
|
||||
// context window size, the model will truncate the response to fit the context
|
||||
// window by dropping input items in the middle of the conversation.
|
||||
// - `disabled` (default): If a model response will exceed the context window size
|
||||
// - `auto`: If the input to this Response exceeds the model's context window size,
|
||||
// the model will truncate the response to fit the context window by dropping
|
||||
// items from the beginning of the conversation.
|
||||
// - `disabled` (default): If the input size will exceed the context window size
|
||||
// for a model, the request will fail with a 400 error.
|
||||
type ResponseNewParamsTruncation string
|
||||
|
||||
|
||||
@@ -217,6 +217,8 @@ type ScoreModel string // Always "score_mo
|
||||
type Screenshot string // Always "screenshot"
|
||||
type Scroll string // Always "scroll"
|
||||
type Search string // Always "search"
|
||||
type SemanticVad string // Always "semantic_vad"
|
||||
type ServerVad string // Always "server_vad"
|
||||
type SessionCreated string // Always "session.created"
|
||||
type SessionUpdate string // Always "session.update"
|
||||
type SessionUpdated string // Always "session.updated"
|
||||
@@ -610,6 +612,8 @@ func (c ScoreModel) Default() ScoreModel { return "score
|
||||
func (c Screenshot) Default() Screenshot { return "screenshot" }
|
||||
func (c Scroll) Default() Scroll { return "scroll" }
|
||||
func (c Search) Default() Search { return "search" }
|
||||
func (c SemanticVad) Default() SemanticVad { return "semantic_vad" }
|
||||
func (c ServerVad) Default() ServerVad { return "server_vad" }
|
||||
func (c SessionCreated) Default() SessionCreated { return "session.created" }
|
||||
func (c SessionUpdate) Default() SessionUpdate { return "session.update" }
|
||||
func (c SessionUpdated) Default() SessionUpdated { return "session.updated" }
|
||||
@@ -903,6 +907,8 @@ func (c ScoreModel) MarshalJSON() ([]byte, error) { retu
|
||||
func (c Screenshot) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c Scroll) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c Search) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c SemanticVad) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c ServerVad) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c SessionCreated) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c SessionUpdate) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
func (c SessionUpdated) MarshalJSON() ([]byte, error) { return marshalString(c) }
|
||||
|
||||
Reference in New Issue
Block a user