mirror of
https://github.com/openai/openai-go.git
synced 2026-04-01 00:57:11 +09:00
951 lines
38 KiB
Go
951 lines
38 KiB
Go
// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
|
|
|
|
package openai
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"io"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"slices"
|
|
|
|
"github.com/openai/openai-go/v3/internal/apiform"
|
|
"github.com/openai/openai-go/v3/internal/apijson"
|
|
"github.com/openai/openai-go/v3/internal/requestconfig"
|
|
"github.com/openai/openai-go/v3/option"
|
|
"github.com/openai/openai-go/v3/packages/param"
|
|
"github.com/openai/openai-go/v3/packages/respjson"
|
|
"github.com/openai/openai-go/v3/packages/ssestream"
|
|
"github.com/openai/openai-go/v3/shared/constant"
|
|
)
|
|
|
|
// Turn audio into text or text into audio.
|
|
//
|
|
// AudioTranscriptionService contains methods and other services that help with
|
|
// interacting with the openai API.
|
|
//
|
|
// Note, unlike clients, this service does not read variables from the environment
|
|
// automatically. You should not instantiate this service directly, and instead use
|
|
// the [NewAudioTranscriptionService] method instead.
|
|
type AudioTranscriptionService struct {
|
|
Options []option.RequestOption
|
|
}
|
|
|
|
// NewAudioTranscriptionService generates a new service that applies the given
|
|
// options to each request. These options are applied after the parent client's
|
|
// options (if there is one), and before any request-specific options.
|
|
func NewAudioTranscriptionService(opts ...option.RequestOption) (r AudioTranscriptionService) {
|
|
r = AudioTranscriptionService{}
|
|
r.Options = opts
|
|
return
|
|
}
|
|
|
|
// Transcribes audio into the input language.
|
|
//
|
|
// Returns a transcription object in `json`, `diarized_json`, or `verbose_json`
|
|
// format, or a stream of transcript events.
|
|
func (r *AudioTranscriptionService) New(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (res *AudioTranscriptionNewResponseUnion, err error) {
|
|
opts = slices.Concat(r.Options, opts)
|
|
path := "audio/transcriptions"
|
|
err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &res, opts...)
|
|
return res, err
|
|
}
|
|
|
|
// Transcribes audio into the input language.
|
|
//
|
|
// Returns a transcription object in `json`, `diarized_json`, or `verbose_json`
|
|
// format, or a stream of transcript events.
|
|
func (r *AudioTranscriptionService) NewStreaming(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (stream *ssestream.Stream[TranscriptionStreamEventUnion]) {
|
|
var (
|
|
raw *http.Response
|
|
err error
|
|
)
|
|
opts = slices.Concat(r.Options, opts)
|
|
body.SetExtraFields(map[string]any{
|
|
"stream": "true",
|
|
})
|
|
path := "audio/transcriptions"
|
|
err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &raw, opts...)
|
|
return ssestream.NewStream[TranscriptionStreamEventUnion](ssestream.NewDecoder(raw), err)
|
|
}
|
|
|
|
// Represents a transcription response returned by model, based on the provided
|
|
// input.
|
|
type Transcription struct {
|
|
// The transcribed text.
|
|
Text string `json:"text" api:"required"`
|
|
// The log probabilities of the tokens in the transcription. Only returned with the
|
|
// models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` if `logprobs` is added
|
|
// to the `include` array.
|
|
Logprobs []TranscriptionLogprob `json:"logprobs"`
|
|
// Token usage statistics for the request.
|
|
Usage TranscriptionUsageUnion `json:"usage"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Text respjson.Field
|
|
Logprobs respjson.Field
|
|
Usage respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r Transcription) RawJSON() string { return r.JSON.raw }
|
|
func (r *Transcription) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
type TranscriptionLogprob struct {
|
|
// The token in the transcription.
|
|
Token string `json:"token"`
|
|
// The bytes of the token.
|
|
Bytes []float64 `json:"bytes"`
|
|
// The log probability of the token.
|
|
Logprob float64 `json:"logprob"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Token respjson.Field
|
|
Bytes respjson.Field
|
|
Logprob respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionLogprob) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionLogprob) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// TranscriptionUsageUnion contains all possible properties and values from
|
|
// [TranscriptionUsageTokens], [TranscriptionUsageDuration].
|
|
//
|
|
// Use the [TranscriptionUsageUnion.AsAny] method to switch on the variant.
|
|
//
|
|
// Use the methods beginning with 'As' to cast the union to one of its variants.
|
|
type TranscriptionUsageUnion struct {
|
|
// This field is from variant [TranscriptionUsageTokens].
|
|
InputTokens int64 `json:"input_tokens"`
|
|
// This field is from variant [TranscriptionUsageTokens].
|
|
OutputTokens int64 `json:"output_tokens"`
|
|
// This field is from variant [TranscriptionUsageTokens].
|
|
TotalTokens int64 `json:"total_tokens"`
|
|
// Any of "tokens", "duration".
|
|
Type string `json:"type"`
|
|
// This field is from variant [TranscriptionUsageTokens].
|
|
InputTokenDetails TranscriptionUsageTokensInputTokenDetails `json:"input_token_details"`
|
|
// This field is from variant [TranscriptionUsageDuration].
|
|
Seconds float64 `json:"seconds"`
|
|
JSON struct {
|
|
InputTokens respjson.Field
|
|
OutputTokens respjson.Field
|
|
TotalTokens respjson.Field
|
|
Type respjson.Field
|
|
InputTokenDetails respjson.Field
|
|
Seconds respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// anyTranscriptionUsage is implemented by each variant of
|
|
// [TranscriptionUsageUnion] to add type safety for the return type of
|
|
// [TranscriptionUsageUnion.AsAny]
|
|
type anyTranscriptionUsage interface {
|
|
implTranscriptionUsageUnion()
|
|
}
|
|
|
|
func (TranscriptionUsageTokens) implTranscriptionUsageUnion() {}
|
|
func (TranscriptionUsageDuration) implTranscriptionUsageUnion() {}
|
|
|
|
// Use the following switch statement to find the correct variant
|
|
//
|
|
// switch variant := TranscriptionUsageUnion.AsAny().(type) {
|
|
// case openai.TranscriptionUsageTokens:
|
|
// case openai.TranscriptionUsageDuration:
|
|
// default:
|
|
// fmt.Errorf("no variant present")
|
|
// }
|
|
func (u TranscriptionUsageUnion) AsAny() anyTranscriptionUsage {
|
|
switch u.Type {
|
|
case "tokens":
|
|
return u.AsTokens()
|
|
case "duration":
|
|
return u.AsDuration()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (u TranscriptionUsageUnion) AsTokens() (v TranscriptionUsageTokens) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
func (u TranscriptionUsageUnion) AsDuration() (v TranscriptionUsageDuration) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (u TranscriptionUsageUnion) RawJSON() string { return u.JSON.raw }
|
|
|
|
func (r *TranscriptionUsageUnion) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Usage statistics for models billed by token usage.
|
|
type TranscriptionUsageTokens struct {
|
|
// Number of input tokens billed for this request.
|
|
InputTokens int64 `json:"input_tokens" api:"required"`
|
|
// Number of output tokens generated.
|
|
OutputTokens int64 `json:"output_tokens" api:"required"`
|
|
// Total number of tokens used (input + output).
|
|
TotalTokens int64 `json:"total_tokens" api:"required"`
|
|
// The type of the usage object. Always `tokens` for this variant.
|
|
Type constant.Tokens `json:"type" default:"tokens"`
|
|
// Details about the input tokens billed for this request.
|
|
InputTokenDetails TranscriptionUsageTokensInputTokenDetails `json:"input_token_details"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
InputTokens respjson.Field
|
|
OutputTokens respjson.Field
|
|
TotalTokens respjson.Field
|
|
Type respjson.Field
|
|
InputTokenDetails respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionUsageTokens) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionUsageTokens) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Details about the input tokens billed for this request.
|
|
type TranscriptionUsageTokensInputTokenDetails struct {
|
|
// Number of audio tokens billed for this request.
|
|
AudioTokens int64 `json:"audio_tokens"`
|
|
// Number of text tokens billed for this request.
|
|
TextTokens int64 `json:"text_tokens"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
AudioTokens respjson.Field
|
|
TextTokens respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionUsageTokensInputTokenDetails) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionUsageTokensInputTokenDetails) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Usage statistics for models billed by audio input duration.
|
|
type TranscriptionUsageDuration struct {
|
|
// Duration of the input audio in seconds.
|
|
Seconds float64 `json:"seconds" api:"required"`
|
|
// The type of the usage object. Always `duration` for this variant.
|
|
Type constant.Duration `json:"type" default:"duration"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Seconds respjson.Field
|
|
Type respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionUsageDuration) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionUsageDuration) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// TranscriptionInclude is the string type of the values declared below.
type TranscriptionInclude string

// Declared TranscriptionInclude values.
const (
	TranscriptionIncludeLogprobs TranscriptionInclude = "logprobs"
)
|
|
|
|
type TranscriptionSegment struct {
|
|
// Unique identifier of the segment.
|
|
ID int64 `json:"id" api:"required"`
|
|
// Average logprob of the segment. If the value is lower than -1, consider the
|
|
// logprobs failed.
|
|
AvgLogprob float64 `json:"avg_logprob" api:"required"`
|
|
// Compression ratio of the segment. If the value is greater than 2.4, consider the
|
|
// compression failed.
|
|
CompressionRatio float64 `json:"compression_ratio" api:"required"`
|
|
// End time of the segment in seconds.
|
|
End float64 `json:"end" api:"required"`
|
|
// Probability of no speech in the segment. If the value is higher than 1.0 and the
|
|
// `avg_logprob` is below -1, consider this segment silent.
|
|
NoSpeechProb float64 `json:"no_speech_prob" api:"required"`
|
|
// Seek offset of the segment.
|
|
Seek int64 `json:"seek" api:"required"`
|
|
// Start time of the segment in seconds.
|
|
Start float64 `json:"start" api:"required"`
|
|
// Temperature parameter used for generating the segment.
|
|
Temperature float64 `json:"temperature" api:"required"`
|
|
// Text content of the segment.
|
|
Text string `json:"text" api:"required"`
|
|
// Array of token IDs for the text content.
|
|
Tokens []int64 `json:"tokens" api:"required"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
ID respjson.Field
|
|
AvgLogprob respjson.Field
|
|
CompressionRatio respjson.Field
|
|
End respjson.Field
|
|
NoSpeechProb respjson.Field
|
|
Seek respjson.Field
|
|
Start respjson.Field
|
|
Temperature respjson.Field
|
|
Text respjson.Field
|
|
Tokens respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionSegment) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionSegment) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// TranscriptionStreamEventUnion contains all possible properties and values from
|
|
// [TranscriptionTextSegmentEvent], [TranscriptionTextDeltaEvent],
|
|
// [TranscriptionTextDoneEvent].
|
|
//
|
|
// Use the [TranscriptionStreamEventUnion.AsAny] method to switch on the variant.
|
|
//
|
|
// Use the methods beginning with 'As' to cast the union to one of its variants.
|
|
type TranscriptionStreamEventUnion struct {
|
|
// This field is from variant [TranscriptionTextSegmentEvent].
|
|
ID string `json:"id"`
|
|
// This field is from variant [TranscriptionTextSegmentEvent].
|
|
End float64 `json:"end"`
|
|
// This field is from variant [TranscriptionTextSegmentEvent].
|
|
Speaker string `json:"speaker"`
|
|
// This field is from variant [TranscriptionTextSegmentEvent].
|
|
Start float64 `json:"start"`
|
|
Text string `json:"text"`
|
|
// Any of "transcript.text.segment", "transcript.text.delta",
|
|
// "transcript.text.done".
|
|
Type string `json:"type"`
|
|
// This field is from variant [TranscriptionTextDeltaEvent].
|
|
Delta string `json:"delta"`
|
|
// This field is a union of [[]TranscriptionTextDeltaEventLogprob],
|
|
// [[]TranscriptionTextDoneEventLogprob]
|
|
Logprobs TranscriptionStreamEventUnionLogprobs `json:"logprobs"`
|
|
// This field is from variant [TranscriptionTextDeltaEvent].
|
|
SegmentID string `json:"segment_id"`
|
|
// This field is from variant [TranscriptionTextDoneEvent].
|
|
Usage TranscriptionTextDoneEventUsage `json:"usage"`
|
|
JSON struct {
|
|
ID respjson.Field
|
|
End respjson.Field
|
|
Speaker respjson.Field
|
|
Start respjson.Field
|
|
Text respjson.Field
|
|
Type respjson.Field
|
|
Delta respjson.Field
|
|
Logprobs respjson.Field
|
|
SegmentID respjson.Field
|
|
Usage respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// anyTranscriptionStreamEvent is implemented by each variant of
|
|
// [TranscriptionStreamEventUnion] to add type safety for the return type of
|
|
// [TranscriptionStreamEventUnion.AsAny]
|
|
type anyTranscriptionStreamEvent interface {
|
|
implTranscriptionStreamEventUnion()
|
|
}
|
|
|
|
func (TranscriptionTextSegmentEvent) implTranscriptionStreamEventUnion() {}
|
|
func (TranscriptionTextDeltaEvent) implTranscriptionStreamEventUnion() {}
|
|
func (TranscriptionTextDoneEvent) implTranscriptionStreamEventUnion() {}
|
|
|
|
// Use the following switch statement to find the correct variant
|
|
//
|
|
// switch variant := TranscriptionStreamEventUnion.AsAny().(type) {
|
|
// case openai.TranscriptionTextSegmentEvent:
|
|
// case openai.TranscriptionTextDeltaEvent:
|
|
// case openai.TranscriptionTextDoneEvent:
|
|
// default:
|
|
// fmt.Errorf("no variant present")
|
|
// }
|
|
func (u TranscriptionStreamEventUnion) AsAny() anyTranscriptionStreamEvent {
|
|
switch u.Type {
|
|
case "transcript.text.segment":
|
|
return u.AsTranscriptTextSegment()
|
|
case "transcript.text.delta":
|
|
return u.AsTranscriptTextDelta()
|
|
case "transcript.text.done":
|
|
return u.AsTranscriptTextDone()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (u TranscriptionStreamEventUnion) AsTranscriptTextSegment() (v TranscriptionTextSegmentEvent) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
func (u TranscriptionStreamEventUnion) AsTranscriptTextDelta() (v TranscriptionTextDeltaEvent) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
func (u TranscriptionStreamEventUnion) AsTranscriptTextDone() (v TranscriptionTextDoneEvent) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (u TranscriptionStreamEventUnion) RawJSON() string { return u.JSON.raw }
|
|
|
|
func (r *TranscriptionStreamEventUnion) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// TranscriptionStreamEventUnionLogprobs is an implicit subunion of
|
|
// [TranscriptionStreamEventUnion]. TranscriptionStreamEventUnionLogprobs provides
|
|
// convenient access to the sub-properties of the union.
|
|
//
|
|
// For type safety it is recommended to directly use a variant of the
|
|
// [TranscriptionStreamEventUnion].
|
|
//
|
|
// If the underlying value is not a json object, one of the following properties
|
|
// will be valid: OfTranscriptionTextDeltaEventLogprobs
|
|
// OfTranscriptionTextDoneEventLogprobs]
|
|
type TranscriptionStreamEventUnionLogprobs struct {
|
|
// This field will be present if the value is a
|
|
// [[]TranscriptionTextDeltaEventLogprob] instead of an object.
|
|
OfTranscriptionTextDeltaEventLogprobs []TranscriptionTextDeltaEventLogprob `json:",inline"`
|
|
// This field will be present if the value is a
|
|
// [[]TranscriptionTextDoneEventLogprob] instead of an object.
|
|
OfTranscriptionTextDoneEventLogprobs []TranscriptionTextDoneEventLogprob `json:",inline"`
|
|
JSON struct {
|
|
OfTranscriptionTextDeltaEventLogprobs respjson.Field
|
|
OfTranscriptionTextDoneEventLogprobs respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
func (r *TranscriptionStreamEventUnionLogprobs) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Emitted when there is an additional text delta. This is also the first event
|
|
// emitted when the transcription starts. Only emitted when you
|
|
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
|
|
// with the `Stream` parameter set to `true`.
|
|
type TranscriptionTextDeltaEvent struct {
|
|
// The text delta that was additionally transcribed.
|
|
Delta string `json:"delta" api:"required"`
|
|
// The type of the event. Always `transcript.text.delta`.
|
|
Type constant.TranscriptTextDelta `json:"type" default:"transcript.text.delta"`
|
|
// The log probabilities of the delta. Only included if you
|
|
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
|
|
// with the `include[]` parameter set to `logprobs`.
|
|
Logprobs []TranscriptionTextDeltaEventLogprob `json:"logprobs"`
|
|
// Identifier of the diarized segment that this delta belongs to. Only present when
|
|
// using `gpt-4o-transcribe-diarize`.
|
|
SegmentID string `json:"segment_id"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Delta respjson.Field
|
|
Type respjson.Field
|
|
Logprobs respjson.Field
|
|
SegmentID respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextDeltaEvent) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextDeltaEvent) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
type TranscriptionTextDeltaEventLogprob struct {
|
|
// The token that was used to generate the log probability.
|
|
Token string `json:"token"`
|
|
// The bytes that were used to generate the log probability.
|
|
Bytes []int64 `json:"bytes"`
|
|
// The log probability of the token.
|
|
Logprob float64 `json:"logprob"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Token respjson.Field
|
|
Bytes respjson.Field
|
|
Logprob respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextDeltaEventLogprob) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextDeltaEventLogprob) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Emitted when the transcription is complete. Contains the complete transcription
|
|
// text. Only emitted when you
|
|
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
|
|
// with the `Stream` parameter set to `true`.
|
|
type TranscriptionTextDoneEvent struct {
|
|
// The text that was transcribed.
|
|
Text string `json:"text" api:"required"`
|
|
// The type of the event. Always `transcript.text.done`.
|
|
Type constant.TranscriptTextDone `json:"type" default:"transcript.text.done"`
|
|
// The log probabilities of the individual tokens in the transcription. Only
|
|
// included if you
|
|
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
|
|
// with the `include[]` parameter set to `logprobs`.
|
|
Logprobs []TranscriptionTextDoneEventLogprob `json:"logprobs"`
|
|
// Usage statistics for models billed by token usage.
|
|
Usage TranscriptionTextDoneEventUsage `json:"usage"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Text respjson.Field
|
|
Type respjson.Field
|
|
Logprobs respjson.Field
|
|
Usage respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextDoneEvent) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextDoneEvent) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
type TranscriptionTextDoneEventLogprob struct {
|
|
// The token that was used to generate the log probability.
|
|
Token string `json:"token"`
|
|
// The bytes that were used to generate the log probability.
|
|
Bytes []int64 `json:"bytes"`
|
|
// The log probability of the token.
|
|
Logprob float64 `json:"logprob"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Token respjson.Field
|
|
Bytes respjson.Field
|
|
Logprob respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextDoneEventLogprob) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextDoneEventLogprob) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Usage statistics for models billed by token usage.
|
|
type TranscriptionTextDoneEventUsage struct {
|
|
// Number of input tokens billed for this request.
|
|
InputTokens int64 `json:"input_tokens" api:"required"`
|
|
// Number of output tokens generated.
|
|
OutputTokens int64 `json:"output_tokens" api:"required"`
|
|
// Total number of tokens used (input + output).
|
|
TotalTokens int64 `json:"total_tokens" api:"required"`
|
|
// The type of the usage object. Always `tokens` for this variant.
|
|
Type constant.Tokens `json:"type" default:"tokens"`
|
|
// Details about the input tokens billed for this request.
|
|
InputTokenDetails TranscriptionTextDoneEventUsageInputTokenDetails `json:"input_token_details"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
InputTokens respjson.Field
|
|
OutputTokens respjson.Field
|
|
TotalTokens respjson.Field
|
|
Type respjson.Field
|
|
InputTokenDetails respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextDoneEventUsage) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextDoneEventUsage) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Details about the input tokens billed for this request.
|
|
type TranscriptionTextDoneEventUsageInputTokenDetails struct {
|
|
// Number of audio tokens billed for this request.
|
|
AudioTokens int64 `json:"audio_tokens"`
|
|
// Number of text tokens billed for this request.
|
|
TextTokens int64 `json:"text_tokens"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
AudioTokens respjson.Field
|
|
TextTokens respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextDoneEventUsageInputTokenDetails) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextDoneEventUsageInputTokenDetails) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Emitted when a diarized transcription returns a completed segment with speaker
|
|
// information. Only emitted when you
|
|
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
|
|
// with `stream` set to `true` and `response_format` set to `diarized_json`.
|
|
type TranscriptionTextSegmentEvent struct {
|
|
// Unique identifier for the segment.
|
|
ID string `json:"id" api:"required"`
|
|
// End timestamp of the segment in seconds.
|
|
End float64 `json:"end" api:"required"`
|
|
// Speaker label for this segment.
|
|
Speaker string `json:"speaker" api:"required"`
|
|
// Start timestamp of the segment in seconds.
|
|
Start float64 `json:"start" api:"required"`
|
|
// Transcript text for this segment.
|
|
Text string `json:"text" api:"required"`
|
|
// The type of the event. Always `transcript.text.segment`.
|
|
Type constant.TranscriptTextSegment `json:"type" default:"transcript.text.segment"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
ID respjson.Field
|
|
End respjson.Field
|
|
Speaker respjson.Field
|
|
Start respjson.Field
|
|
Text respjson.Field
|
|
Type respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionTextSegmentEvent) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionTextSegmentEvent) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Represents a verbose json transcription response returned by model, based on the
|
|
// provided input.
|
|
type TranscriptionVerbose struct {
|
|
// The duration of the input audio.
|
|
Duration float64 `json:"duration" api:"required"`
|
|
// The language of the input audio.
|
|
Language string `json:"language" api:"required"`
|
|
// The transcribed text.
|
|
Text string `json:"text" api:"required"`
|
|
// Segments of the transcribed text and their corresponding details.
|
|
Segments []TranscriptionSegment `json:"segments"`
|
|
// Usage statistics for models billed by audio input duration.
|
|
Usage TranscriptionVerboseUsage `json:"usage"`
|
|
// Extracted words and their corresponding timestamps.
|
|
Words []TranscriptionWord `json:"words"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Duration respjson.Field
|
|
Language respjson.Field
|
|
Text respjson.Field
|
|
Segments respjson.Field
|
|
Usage respjson.Field
|
|
Words respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionVerbose) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionVerbose) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// Usage statistics for models billed by audio input duration.
|
|
type TranscriptionVerboseUsage struct {
|
|
// Duration of the input audio in seconds.
|
|
Seconds float64 `json:"seconds" api:"required"`
|
|
// The type of the usage object. Always `duration` for this variant.
|
|
Type constant.Duration `json:"type" default:"duration"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
Seconds respjson.Field
|
|
Type respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionVerboseUsage) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionVerboseUsage) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
type TranscriptionWord struct {
|
|
// End time of the word in seconds.
|
|
End float64 `json:"end" api:"required"`
|
|
// Start time of the word in seconds.
|
|
Start float64 `json:"start" api:"required"`
|
|
// The text content of the word.
|
|
Word string `json:"word" api:"required"`
|
|
// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
|
|
JSON struct {
|
|
End respjson.Field
|
|
Start respjson.Field
|
|
Word respjson.Field
|
|
ExtraFields map[string]respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (r TranscriptionWord) RawJSON() string { return r.JSON.raw }
|
|
func (r *TranscriptionWord) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// AudioTranscriptionNewResponseUnion contains all possible properties and values
|
|
// from [Transcription], [TranscriptionVerbose].
|
|
//
|
|
// Use the methods beginning with 'As' to cast the union to one of its variants.
|
|
type AudioTranscriptionNewResponseUnion struct {
|
|
Text string `json:"text"`
|
|
// This field is from variant [Transcription].
|
|
Logprobs []TranscriptionLogprob `json:"logprobs"`
|
|
// This field is a union of [TranscriptionUsageUnion], [TranscriptionVerboseUsage]
|
|
Usage AudioTranscriptionNewResponseUnionUsage `json:"usage"`
|
|
// This field is from variant [TranscriptionVerbose].
|
|
Duration float64 `json:"duration"`
|
|
// This field is from variant [TranscriptionVerbose].
|
|
Language string `json:"language"`
|
|
// This field is from variant [TranscriptionVerbose].
|
|
Segments []TranscriptionSegment `json:"segments"`
|
|
// This field is from variant [TranscriptionVerbose].
|
|
Words []TranscriptionWord `json:"words"`
|
|
JSON struct {
|
|
Text respjson.Field
|
|
Logprobs respjson.Field
|
|
Usage respjson.Field
|
|
Duration respjson.Field
|
|
Language respjson.Field
|
|
Segments respjson.Field
|
|
Words respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
func (u AudioTranscriptionNewResponseUnion) AsTranscription() (v Transcription) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
func (u AudioTranscriptionNewResponseUnion) AsTranscriptionVerbose() (v TranscriptionVerbose) {
|
|
apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
|
|
return
|
|
}
|
|
|
|
// Returns the unmodified JSON received from the API
|
|
func (u AudioTranscriptionNewResponseUnion) RawJSON() string { return u.JSON.raw }
|
|
|
|
func (r *AudioTranscriptionNewResponseUnion) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
// AudioTranscriptionNewResponseUnionUsage is an implicit subunion of
|
|
// [AudioTranscriptionNewResponseUnion]. AudioTranscriptionNewResponseUnionUsage
|
|
// provides convenient access to the sub-properties of the union.
|
|
//
|
|
// For type safety it is recommended to directly use a variant of the
|
|
// [AudioTranscriptionNewResponseUnion].
|
|
type AudioTranscriptionNewResponseUnionUsage struct {
|
|
// This field is from variant [TranscriptionUsageUnion].
|
|
InputTokens int64 `json:"input_tokens"`
|
|
// This field is from variant [TranscriptionUsageUnion].
|
|
OutputTokens int64 `json:"output_tokens"`
|
|
// This field is from variant [TranscriptionUsageUnion].
|
|
TotalTokens int64 `json:"total_tokens"`
|
|
Type string `json:"type"`
|
|
// This field is from variant [TranscriptionUsageUnion].
|
|
InputTokenDetails TranscriptionUsageTokensInputTokenDetails `json:"input_token_details"`
|
|
Seconds float64 `json:"seconds"`
|
|
JSON struct {
|
|
InputTokens respjson.Field
|
|
OutputTokens respjson.Field
|
|
TotalTokens respjson.Field
|
|
Type respjson.Field
|
|
InputTokenDetails respjson.Field
|
|
Seconds respjson.Field
|
|
raw string
|
|
} `json:"-"`
|
|
}
|
|
|
|
func (r *AudioTranscriptionNewResponseUnionUsage) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
type AudioTranscriptionNewParams struct {
|
|
// The audio file object (not file name) to transcribe, in one of these formats:
|
|
// flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
|
|
File io.Reader `json:"file,omitzero" api:"required" format:"binary"`
|
|
// ID of the model to use. The options are `gpt-4o-transcribe`,
|
|
// `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `whisper-1`
|
|
// (which is powered by our open source Whisper V2 model), and
|
|
// `gpt-4o-transcribe-diarize`.
|
|
Model AudioModel `json:"model,omitzero" api:"required"`
|
|
// The language of the input audio. Supplying the input language in
|
|
// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
|
|
// format will improve accuracy and latency.
|
|
Language param.Opt[string] `json:"language,omitzero"`
|
|
// An optional text to guide the model's style or continue a previous audio
|
|
// segment. The
|
|
// [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
|
|
// should match the audio language. This field is not supported when using
|
|
// `gpt-4o-transcribe-diarize`.
|
|
Prompt param.Opt[string] `json:"prompt,omitzero"`
|
|
// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
|
|
// output more random, while lower values like 0.2 will make it more focused and
|
|
// deterministic. If set to 0, the model will use
|
|
// [log probability](https://en.wikipedia.org/wiki/Log_probability) to
|
|
// automatically increase the temperature until certain thresholds are hit.
|
|
Temperature param.Opt[float64] `json:"temperature,omitzero"`
|
|
// Controls how the audio is cut into chunks. When set to `"auto"`, the server
|
|
// first normalizes loudness and then uses voice activity detection (VAD) to choose
|
|
// boundaries. `server_vad` object can be provided to tweak VAD detection
|
|
// parameters manually. If unset, the audio is transcribed as a single block.
|
|
// Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
|
|
// seconds.
|
|
ChunkingStrategy AudioTranscriptionNewParamsChunkingStrategyUnion `json:"chunking_strategy,omitzero"`
|
|
// Additional information to include in the transcription response. `logprobs` will
|
|
// return the log probabilities of the tokens in the response to understand the
|
|
// model's confidence in the transcription. `logprobs` only works with
|
|
// response_format set to `json` and only with the models `gpt-4o-transcribe`,
|
|
// `gpt-4o-mini-transcribe`, and `gpt-4o-mini-transcribe-2025-12-15`. This field is
|
|
// not supported when using `gpt-4o-transcribe-diarize`.
|
|
Include []TranscriptionInclude `json:"include,omitzero"`
|
|
// Optional list of speaker names that correspond to the audio samples provided in
|
|
// `known_speaker_references[]`. Each entry should be a short identifier (for
|
|
// example `customer` or `agent`). Up to 4 speakers are supported.
|
|
KnownSpeakerNames []string `json:"known_speaker_names,omitzero"`
|
|
// Optional list of audio samples (as
|
|
// [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
|
|
// that contain known speaker references matching `known_speaker_names[]`. Each
|
|
// sample must be between 2 and 10 seconds, and can use any of the same input audio
|
|
// formats supported by `file`.
|
|
KnownSpeakerReferences []string `json:"known_speaker_references,omitzero"`
|
|
// The format of the output, in one of these options: `json`, `text`, `srt`,
|
|
// `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
|
|
// `gpt-4o-mini-transcribe`, the only supported format is `json`. For
|
|
// `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
|
|
// `diarized_json`, with `diarized_json` required to receive speaker annotations.
|
|
//
|
|
// Any of "json", "text", "srt", "verbose_json", "vtt", "diarized_json".
|
|
ResponseFormat AudioResponseFormat `json:"response_format,omitzero"`
|
|
// The timestamp granularities to populate for this transcription.
|
|
// `response_format` must be set `verbose_json` to use timestamp granularities.
|
|
// Either or both of these options are supported: `word`, or `segment`. Note: There
|
|
// is no additional latency for segment timestamps, but generating word timestamps
|
|
// incurs additional latency. This option is not available for
|
|
// `gpt-4o-transcribe-diarize`.
|
|
//
|
|
// Any of "word", "segment".
|
|
TimestampGranularities []string `json:"timestamp_granularities,omitzero"`
|
|
paramObj
|
|
}
|
|
|
|
func (r AudioTranscriptionNewParams) MarshalMultipart() (data []byte, contentType string, err error) {
|
|
buf := bytes.NewBuffer(nil)
|
|
writer := multipart.NewWriter(buf)
|
|
err = apiform.MarshalRoot(r, writer)
|
|
if err == nil {
|
|
err = apiform.WriteExtras(writer, r.ExtraFields())
|
|
}
|
|
if err != nil {
|
|
writer.Close()
|
|
return nil, "", err
|
|
}
|
|
err = writer.Close()
|
|
if err != nil {
|
|
return nil, "", err
|
|
}
|
|
return buf.Bytes(), writer.FormDataContentType(), nil
|
|
}
|
|
|
|
// Only one field can be non-zero.
|
|
//
|
|
// Use [param.IsOmitted] to confirm if a field is set.
|
|
type AudioTranscriptionNewParamsChunkingStrategyUnion struct {
|
|
// Construct this variant with constant.ValueOf[constant.Auto]()
|
|
OfAuto constant.Auto `json:",omitzero,inline"`
|
|
OfAudioTranscriptionNewsChunkingStrategyVadConfig *AudioTranscriptionNewParamsChunkingStrategyVadConfig `json:",omitzero,inline"`
|
|
paramUnion
|
|
}
|
|
|
|
func (u AudioTranscriptionNewParamsChunkingStrategyUnion) MarshalJSON() ([]byte, error) {
|
|
return param.MarshalUnion(u, u.OfAuto, u.OfAudioTranscriptionNewsChunkingStrategyVadConfig)
|
|
}
|
|
func (u *AudioTranscriptionNewParamsChunkingStrategyUnion) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, u)
|
|
}
|
|
|
|
func (u *AudioTranscriptionNewParamsChunkingStrategyUnion) asAny() any {
|
|
if !param.IsOmitted(u.OfAuto) {
|
|
return &u.OfAuto
|
|
} else if !param.IsOmitted(u.OfAudioTranscriptionNewsChunkingStrategyVadConfig) {
|
|
return u.OfAudioTranscriptionNewsChunkingStrategyVadConfig
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// The property Type is required.
|
|
type AudioTranscriptionNewParamsChunkingStrategyVadConfig struct {
|
|
// Must be set to `server_vad` to enable manual chunking using server side VAD.
|
|
//
|
|
// Any of "server_vad".
|
|
Type string `json:"type,omitzero" api:"required"`
|
|
// Amount of audio to include before the VAD detected speech (in milliseconds).
|
|
PrefixPaddingMs param.Opt[int64] `json:"prefix_padding_ms,omitzero"`
|
|
// Duration of silence to detect speech stop (in milliseconds). With shorter values
|
|
// the model will respond more quickly, but may jump in on short pauses from the
|
|
// user.
|
|
SilenceDurationMs param.Opt[int64] `json:"silence_duration_ms,omitzero"`
|
|
// Sensitivity threshold (0.0 to 1.0) for voice activity detection. A higher
|
|
// threshold will require louder audio to activate the model, and thus might
|
|
// perform better in noisy environments.
|
|
Threshold param.Opt[float64] `json:"threshold,omitzero"`
|
|
paramObj
|
|
}
|
|
|
|
func (r AudioTranscriptionNewParamsChunkingStrategyVadConfig) MarshalJSON() (data []byte, err error) {
|
|
type shadow AudioTranscriptionNewParamsChunkingStrategyVadConfig
|
|
return param.MarshalObject(r, (*shadow)(&r))
|
|
}
|
|
func (r *AudioTranscriptionNewParamsChunkingStrategyVadConfig) UnmarshalJSON(data []byte) error {
|
|
return apijson.UnmarshalRoot(data, r)
|
|
}
|
|
|
|
func init() {
|
|
apijson.RegisterFieldValidator[AudioTranscriptionNewParamsChunkingStrategyVadConfig](
|
|
"type", "server_vad",
|
|
)
|
|
}
|