openai.openai-go/audiotranscription.go

// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.

package openai

import (
	"bytes"
	"context"
	"encoding/json"
	"io"
	"mime/multipart"
	"net/http"
	"slices"

	"github.com/openai/openai-go/v3/internal/apiform"
	"github.com/openai/openai-go/v3/internal/apijson"
	"github.com/openai/openai-go/v3/internal/requestconfig"
	"github.com/openai/openai-go/v3/option"
	"github.com/openai/openai-go/v3/packages/param"
	"github.com/openai/openai-go/v3/packages/respjson"
	"github.com/openai/openai-go/v3/packages/ssestream"
	"github.com/openai/openai-go/v3/shared/constant"
)

// Turn audio into text or text into audio.
//
// AudioTranscriptionService contains methods and other services that help with
// interacting with the openai API.
//
// Note, unlike clients, this service does not read variables from the environment
// automatically. You should not instantiate this service directly, and instead use
// the [NewAudioTranscriptionService] method instead.
type AudioTranscriptionService struct {
	Options []option.RequestOption
}

// NewAudioTranscriptionService generates a new service that applies the given
// options to each request. These options are applied after the parent client's
// options (if there is one), and before any request-specific options.
func NewAudioTranscriptionService(opts ...option.RequestOption) (r AudioTranscriptionService) {
	r = AudioTranscriptionService{}
	r.Options = opts
	return
}

// Transcribes audio into the input language.
//
// Returns a transcription object in `json`, `diarized_json`, or `verbose_json`
// format, or a stream of transcript events.
func (r *AudioTranscriptionService) New(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (res *AudioTranscriptionNewResponseUnion, err error) {
	opts = slices.Concat(r.Options, opts)
	path := "audio/transcriptions"
	err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &res, opts...)
	return res, err
}

// Transcribes audio into the input language.
//
// Returns a transcription object in `json`, `diarized_json`, or `verbose_json`
// format, or a stream of transcript events.
func (r *AudioTranscriptionService) NewStreaming(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (stream *ssestream.Stream[TranscriptionStreamEventUnion]) {
	var (
		raw *http.Response
		err error
	)
	opts = slices.Concat(r.Options, opts)
	body.SetExtraFields(map[string]any{
		"stream": "true",
	})
	path := "audio/transcriptions"
	err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &raw, opts...)
	return ssestream.NewStream[TranscriptionStreamEventUnion](ssestream.NewDecoder(raw), err)
}

// Represents a transcription response returned by model, based on the provided
// input.
type Transcription struct {
	// The transcribed text.
	Text string `json:"text" api:"required"`
	// The log probabilities of the tokens in the transcription. Only returned with the
	// models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` if `logprobs` is added
	// to the `include` array.
	Logprobs []TranscriptionLogprob `json:"logprobs"`
	// Token usage statistics for the request.
	Usage TranscriptionUsageUnion `json:"usage"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Text        respjson.Field
		Logprobs    respjson.Field
		Usage       respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r Transcription) RawJSON() string { return r.JSON.raw }
func (r *Transcription) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

type TranscriptionLogprob struct {
	// The token in the transcription.
	Token string `json:"token"`
	// The bytes of the token.
	Bytes []float64 `json:"bytes"`
	// The log probability of the token.
	Logprob float64 `json:"logprob"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Token       respjson.Field
		Bytes       respjson.Field
		Logprob     respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionLogprob) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionLogprob) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// TranscriptionUsageUnion contains all possible properties and values from
// [TranscriptionUsageTokens], [TranscriptionUsageDuration].
//
// Use the [TranscriptionUsageUnion.AsAny] method to switch on the variant.
//
// Use the methods beginning with 'As' to cast the union to one of its variants.
type TranscriptionUsageUnion struct {
	// This field is from variant [TranscriptionUsageTokens].
	InputTokens int64 `json:"input_tokens"`
	// This field is from variant [TranscriptionUsageTokens].
	OutputTokens int64 `json:"output_tokens"`
	// This field is from variant [TranscriptionUsageTokens].
	TotalTokens int64 `json:"total_tokens"`
	// Any of "tokens", "duration".
	Type string `json:"type"`
	// This field is from variant [TranscriptionUsageTokens].
	InputTokenDetails TranscriptionUsageTokensInputTokenDetails `json:"input_token_details"`
	// This field is from variant [TranscriptionUsageDuration].
	Seconds float64 `json:"seconds"`
	JSON    struct {
		InputTokens       respjson.Field
		OutputTokens      respjson.Field
		TotalTokens       respjson.Field
		Type              respjson.Field
		InputTokenDetails respjson.Field
		Seconds           respjson.Field
		raw               string
	} `json:"-"`
}

// anyTranscriptionUsage is implemented by each variant of
// [TranscriptionUsageUnion] to add type safety for the return type of
// [TranscriptionUsageUnion.AsAny]
type anyTranscriptionUsage interface {
	implTranscriptionUsageUnion()
}

func (TranscriptionUsageTokens) implTranscriptionUsageUnion()   {}
func (TranscriptionUsageDuration) implTranscriptionUsageUnion() {}

// Use the following switch statement to find the correct variant
//
//	switch variant := TranscriptionUsageUnion.AsAny().(type) {
//	case openai.TranscriptionUsageTokens:
//	case openai.TranscriptionUsageDuration:
//	default:
//	  fmt.Errorf("no variant present")
//	}
func (u TranscriptionUsageUnion) AsAny() anyTranscriptionUsage {
	switch u.Type {
	case "tokens":
		return u.AsTokens()
	case "duration":
		return u.AsDuration()
	}
	return nil
}

func (u TranscriptionUsageUnion) AsTokens() (v TranscriptionUsageTokens) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

func (u TranscriptionUsageUnion) AsDuration() (v TranscriptionUsageDuration) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

// Returns the unmodified JSON received from the API
func (u TranscriptionUsageUnion) RawJSON() string { return u.JSON.raw }

func (r *TranscriptionUsageUnion) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Usage statistics for models billed by token usage.
type TranscriptionUsageTokens struct {
	// Number of input tokens billed for this request.
	InputTokens int64 `json:"input_tokens" api:"required"`
	// Number of output tokens generated.
	OutputTokens int64 `json:"output_tokens" api:"required"`
	// Total number of tokens used (input + output).
	TotalTokens int64 `json:"total_tokens" api:"required"`
	// The type of the usage object. Always `tokens` for this variant.
	Type constant.Tokens `json:"type" default:"tokens"`
	// Details about the input tokens billed for this request.
	InputTokenDetails TranscriptionUsageTokensInputTokenDetails `json:"input_token_details"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		InputTokens       respjson.Field
		OutputTokens      respjson.Field
		TotalTokens       respjson.Field
		Type              respjson.Field
		InputTokenDetails respjson.Field
		ExtraFields       map[string]respjson.Field
		raw               string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionUsageTokens) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionUsageTokens) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Details about the input tokens billed for this request.
type TranscriptionUsageTokensInputTokenDetails struct {
	// Number of audio tokens billed for this request.
	AudioTokens int64 `json:"audio_tokens"`
	// Number of text tokens billed for this request.
	TextTokens int64 `json:"text_tokens"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		AudioTokens respjson.Field
		TextTokens  respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionUsageTokensInputTokenDetails) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionUsageTokensInputTokenDetails) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Usage statistics for models billed by audio input duration.
type TranscriptionUsageDuration struct {
	// Duration of the input audio in seconds.
	Seconds float64 `json:"seconds" api:"required"`
	// The type of the usage object. Always `duration` for this variant.
	Type constant.Duration `json:"type" default:"duration"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Seconds     respjson.Field
		Type        respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionUsageDuration) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionUsageDuration) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

type TranscriptionInclude string

const (
	TranscriptionIncludeLogprobs TranscriptionInclude = "logprobs"
)

type TranscriptionSegment struct {
	// Unique identifier of the segment.
	ID int64 `json:"id" api:"required"`
	// Average logprob of the segment. If the value is lower than -1, consider the
	// logprobs failed.
	AvgLogprob float64 `json:"avg_logprob" api:"required"`
	// Compression ratio of the segment. If the value is greater than 2.4, consider the
	// compression failed.
	CompressionRatio float64 `json:"compression_ratio" api:"required"`
	// End time of the segment in seconds.
	End float64 `json:"end" api:"required"`
	// Probability of no speech in the segment. If the value is higher than 1.0 and the
	// `avg_logprob` is below -1, consider this segment silent.
	NoSpeechProb float64 `json:"no_speech_prob" api:"required"`
	// Seek offset of the segment.
	Seek int64 `json:"seek" api:"required"`
	// Start time of the segment in seconds.
	Start float64 `json:"start" api:"required"`
	// Temperature parameter used for generating the segment.
	Temperature float64 `json:"temperature" api:"required"`
	// Text content of the segment.
	Text string `json:"text" api:"required"`
	// Array of token IDs for the text content.
	Tokens []int64 `json:"tokens" api:"required"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		ID               respjson.Field
		AvgLogprob       respjson.Field
		CompressionRatio respjson.Field
		End              respjson.Field
		NoSpeechProb     respjson.Field
		Seek             respjson.Field
		Start            respjson.Field
		Temperature      respjson.Field
		Text             respjson.Field
		Tokens           respjson.Field
		ExtraFields      map[string]respjson.Field
		raw              string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionSegment) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionSegment) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// TranscriptionStreamEventUnion contains all possible properties and values from
// [TranscriptionTextSegmentEvent], [TranscriptionTextDeltaEvent],
// [TranscriptionTextDoneEvent].
//
// Use the [TranscriptionStreamEventUnion.AsAny] method to switch on the variant.
//
// Use the methods beginning with 'As' to cast the union to one of its variants.
type TranscriptionStreamEventUnion struct {
	// This field is from variant [TranscriptionTextSegmentEvent].
	ID string `json:"id"`
	// This field is from variant [TranscriptionTextSegmentEvent].
	End float64 `json:"end"`
	// This field is from variant [TranscriptionTextSegmentEvent].
	Speaker string `json:"speaker"`
	// This field is from variant [TranscriptionTextSegmentEvent].
	Start float64 `json:"start"`
	Text  string  `json:"text"`
	// Any of "transcript.text.segment", "transcript.text.delta",
	// "transcript.text.done".
	Type string `json:"type"`
	// This field is from variant [TranscriptionTextDeltaEvent].
	Delta string `json:"delta"`
	// This field is a union of [[]TranscriptionTextDeltaEventLogprob],
	// [[]TranscriptionTextDoneEventLogprob]
	Logprobs TranscriptionStreamEventUnionLogprobs `json:"logprobs"`
	// This field is from variant [TranscriptionTextDeltaEvent].
	SegmentID string `json:"segment_id"`
	// This field is from variant [TranscriptionTextDoneEvent].
	Usage TranscriptionTextDoneEventUsage `json:"usage"`
	JSON  struct {
		ID        respjson.Field
		End       respjson.Field
		Speaker   respjson.Field
		Start     respjson.Field
		Text      respjson.Field
		Type      respjson.Field
		Delta     respjson.Field
		Logprobs  respjson.Field
		SegmentID respjson.Field
		Usage     respjson.Field
		raw       string
	} `json:"-"`
}

// anyTranscriptionStreamEvent is implemented by each variant of
// [TranscriptionStreamEventUnion] to add type safety for the return type of
// [TranscriptionStreamEventUnion.AsAny]
type anyTranscriptionStreamEvent interface {
	implTranscriptionStreamEventUnion()
}

func (TranscriptionTextSegmentEvent) implTranscriptionStreamEventUnion() {}
func (TranscriptionTextDeltaEvent) implTranscriptionStreamEventUnion()   {}
func (TranscriptionTextDoneEvent) implTranscriptionStreamEventUnion()    {}

// Use the following switch statement to find the correct variant
//
//	switch variant := TranscriptionStreamEventUnion.AsAny().(type) {
//	case openai.TranscriptionTextSegmentEvent:
//	case openai.TranscriptionTextDeltaEvent:
//	case openai.TranscriptionTextDoneEvent:
//	default:
//	  fmt.Errorf("no variant present")
//	}
func (u TranscriptionStreamEventUnion) AsAny() anyTranscriptionStreamEvent {
	switch u.Type {
	case "transcript.text.segment":
		return u.AsTranscriptTextSegment()
	case "transcript.text.delta":
		return u.AsTranscriptTextDelta()
	case "transcript.text.done":
		return u.AsTranscriptTextDone()
	}
	return nil
}

func (u TranscriptionStreamEventUnion) AsTranscriptTextSegment() (v TranscriptionTextSegmentEvent) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

func (u TranscriptionStreamEventUnion) AsTranscriptTextDelta() (v TranscriptionTextDeltaEvent) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

func (u TranscriptionStreamEventUnion) AsTranscriptTextDone() (v TranscriptionTextDoneEvent) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

// Returns the unmodified JSON received from the API
func (u TranscriptionStreamEventUnion) RawJSON() string { return u.JSON.raw }

func (r *TranscriptionStreamEventUnion) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// TranscriptionStreamEventUnionLogprobs is an implicit subunion of
// [TranscriptionStreamEventUnion]. TranscriptionStreamEventUnionLogprobs provides
// convenient access to the sub-properties of the union.
//
// For type safety it is recommended to directly use a variant of the
// [TranscriptionStreamEventUnion].
//
// If the underlying value is not a json object, one of the following properties
// will be valid: OfTranscriptionTextDeltaEventLogprobs
// OfTranscriptionTextDoneEventLogprobs]
type TranscriptionStreamEventUnionLogprobs struct {
	// This field will be present if the value is a
	// [[]TranscriptionTextDeltaEventLogprob] instead of an object.
	OfTranscriptionTextDeltaEventLogprobs []TranscriptionTextDeltaEventLogprob `json:",inline"`
	// This field will be present if the value is a
	// [[]TranscriptionTextDoneEventLogprob] instead of an object.
	OfTranscriptionTextDoneEventLogprobs []TranscriptionTextDoneEventLogprob `json:",inline"`
	JSON                                 struct {
		OfTranscriptionTextDeltaEventLogprobs respjson.Field
		OfTranscriptionTextDoneEventLogprobs  respjson.Field
		raw                                   string
	} `json:"-"`
}

func (r *TranscriptionStreamEventUnionLogprobs) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Emitted when there is an additional text delta. This is also the first event
// emitted when the transcription starts. Only emitted when you
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
// with the `Stream` parameter set to `true`.
type TranscriptionTextDeltaEvent struct {
	// The text delta that was additionally transcribed.
	Delta string `json:"delta" api:"required"`
	// The type of the event. Always `transcript.text.delta`.
	Type constant.TranscriptTextDelta `json:"type" default:"transcript.text.delta"`
	// The log probabilities of the delta. Only included if you
	// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
	// with the `include[]` parameter set to `logprobs`.
	Logprobs []TranscriptionTextDeltaEventLogprob `json:"logprobs"`
	// Identifier of the diarized segment that this delta belongs to. Only present when
	// using `gpt-4o-transcribe-diarize`.
	SegmentID string `json:"segment_id"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Delta       respjson.Field
		Type        respjson.Field
		Logprobs    respjson.Field
		SegmentID   respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextDeltaEvent) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextDeltaEvent) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

type TranscriptionTextDeltaEventLogprob struct {
	// The token that was used to generate the log probability.
	Token string `json:"token"`
	// The bytes that were used to generate the log probability.
	Bytes []int64 `json:"bytes"`
	// The log probability of the token.
	Logprob float64 `json:"logprob"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Token       respjson.Field
		Bytes       respjson.Field
		Logprob     respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextDeltaEventLogprob) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextDeltaEventLogprob) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Emitted when the transcription is complete. Contains the complete transcription
// text. Only emitted when you
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
// with the `Stream` parameter set to `true`.
type TranscriptionTextDoneEvent struct {
	// The text that was transcribed.
	Text string `json:"text" api:"required"`
	// The type of the event. Always `transcript.text.done`.
	Type constant.TranscriptTextDone `json:"type" default:"transcript.text.done"`
	// The log probabilities of the individual tokens in the transcription. Only
	// included if you
	// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
	// with the `include[]` parameter set to `logprobs`.
	Logprobs []TranscriptionTextDoneEventLogprob `json:"logprobs"`
	// Usage statistics for models billed by token usage.
	Usage TranscriptionTextDoneEventUsage `json:"usage"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Text        respjson.Field
		Type        respjson.Field
		Logprobs    respjson.Field
		Usage       respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextDoneEvent) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextDoneEvent) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

type TranscriptionTextDoneEventLogprob struct {
	// The token that was used to generate the log probability.
	Token string `json:"token"`
	// The bytes that were used to generate the log probability.
	Bytes []int64 `json:"bytes"`
	// The log probability of the token.
	Logprob float64 `json:"logprob"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Token       respjson.Field
		Bytes       respjson.Field
		Logprob     respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextDoneEventLogprob) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextDoneEventLogprob) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Usage statistics for models billed by token usage.
type TranscriptionTextDoneEventUsage struct {
	// Number of input tokens billed for this request.
	InputTokens int64 `json:"input_tokens" api:"required"`
	// Number of output tokens generated.
	OutputTokens int64 `json:"output_tokens" api:"required"`
	// Total number of tokens used (input + output).
	TotalTokens int64 `json:"total_tokens" api:"required"`
	// The type of the usage object. Always `tokens` for this variant.
	Type constant.Tokens `json:"type" default:"tokens"`
	// Details about the input tokens billed for this request.
	InputTokenDetails TranscriptionTextDoneEventUsageInputTokenDetails `json:"input_token_details"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		InputTokens       respjson.Field
		OutputTokens      respjson.Field
		TotalTokens       respjson.Field
		Type              respjson.Field
		InputTokenDetails respjson.Field
		ExtraFields       map[string]respjson.Field
		raw               string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextDoneEventUsage) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextDoneEventUsage) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Details about the input tokens billed for this request.
type TranscriptionTextDoneEventUsageInputTokenDetails struct {
	// Number of audio tokens billed for this request.
	AudioTokens int64 `json:"audio_tokens"`
	// Number of text tokens billed for this request.
	TextTokens int64 `json:"text_tokens"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		AudioTokens respjson.Field
		TextTokens  respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextDoneEventUsageInputTokenDetails) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextDoneEventUsageInputTokenDetails) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Emitted when a diarized transcription returns a completed segment with speaker
// information. Only emitted when you
// [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
// with `stream` set to `true` and `response_format` set to `diarized_json`.
type TranscriptionTextSegmentEvent struct {
	// Unique identifier for the segment.
	ID string `json:"id" api:"required"`
	// End timestamp of the segment in seconds.
	End float64 `json:"end" api:"required"`
	// Speaker label for this segment.
	Speaker string `json:"speaker" api:"required"`
	// Start timestamp of the segment in seconds.
	Start float64 `json:"start" api:"required"`
	// Transcript text for this segment.
	Text string `json:"text" api:"required"`
	// The type of the event. Always `transcript.text.segment`.
	Type constant.TranscriptTextSegment `json:"type" default:"transcript.text.segment"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		ID          respjson.Field
		End         respjson.Field
		Speaker     respjson.Field
		Start       respjson.Field
		Text        respjson.Field
		Type        respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionTextSegmentEvent) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionTextSegmentEvent) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Represents a verbose json transcription response returned by model, based on the
// provided input.
type TranscriptionVerbose struct {
	// The duration of the input audio.
	Duration float64 `json:"duration" api:"required"`
	// The language of the input audio.
	Language string `json:"language" api:"required"`
	// The transcribed text.
	Text string `json:"text" api:"required"`
	// Segments of the transcribed text and their corresponding details.
	Segments []TranscriptionSegment `json:"segments"`
	// Usage statistics for models billed by audio input duration.
	Usage TranscriptionVerboseUsage `json:"usage"`
	// Extracted words and their corresponding timestamps.
	Words []TranscriptionWord `json:"words"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Duration    respjson.Field
		Language    respjson.Field
		Text        respjson.Field
		Segments    respjson.Field
		Usage       respjson.Field
		Words       respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionVerbose) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionVerbose) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// Usage statistics for models billed by audio input duration.
type TranscriptionVerboseUsage struct {
	// Duration of the input audio in seconds.
	Seconds float64 `json:"seconds" api:"required"`
	// The type of the usage object. Always `duration` for this variant.
	Type constant.Duration `json:"type" default:"duration"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		Seconds     respjson.Field
		Type        respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionVerboseUsage) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionVerboseUsage) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

type TranscriptionWord struct {
	// End time of the word in seconds.
	End float64 `json:"end" api:"required"`
	// Start time of the word in seconds.
	Start float64 `json:"start" api:"required"`
	// The text content of the word.
	Word string `json:"word" api:"required"`
	// JSON contains metadata for fields, check presence with [respjson.Field.Valid].
	JSON struct {
		End         respjson.Field
		Start       respjson.Field
		Word        respjson.Field
		ExtraFields map[string]respjson.Field
		raw         string
	} `json:"-"`
}

// Returns the unmodified JSON received from the API
func (r TranscriptionWord) RawJSON() string { return r.JSON.raw }
func (r *TranscriptionWord) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// AudioTranscriptionNewResponseUnion contains all possible properties and values
// from [Transcription], [TranscriptionVerbose].
//
// Use the methods beginning with 'As' to cast the union to one of its variants.
type AudioTranscriptionNewResponseUnion struct {
	Text string `json:"text"`
	// This field is from variant [Transcription].
	Logprobs []TranscriptionLogprob `json:"logprobs"`
	// This field is a union of [TranscriptionUsageUnion], [TranscriptionVerboseUsage]
	Usage AudioTranscriptionNewResponseUnionUsage `json:"usage"`
	// This field is from variant [TranscriptionVerbose].
	Duration float64 `json:"duration"`
	// This field is from variant [TranscriptionVerbose].
	Language string `json:"language"`
	// This field is from variant [TranscriptionVerbose].
	Segments []TranscriptionSegment `json:"segments"`
	// This field is from variant [TranscriptionVerbose].
	Words []TranscriptionWord `json:"words"`
	JSON  struct {
		Text     respjson.Field
		Logprobs respjson.Field
		Usage    respjson.Field
		Duration respjson.Field
		Language respjson.Field
		Segments respjson.Field
		Words    respjson.Field
		raw      string
	} `json:"-"`
}

func (u AudioTranscriptionNewResponseUnion) AsTranscription() (v Transcription) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

func (u AudioTranscriptionNewResponseUnion) AsTranscriptionVerbose() (v TranscriptionVerbose) {
	apijson.UnmarshalRoot(json.RawMessage(u.JSON.raw), &v)
	return
}

// Returns the unmodified JSON received from the API
func (u AudioTranscriptionNewResponseUnion) RawJSON() string { return u.JSON.raw }

func (r *AudioTranscriptionNewResponseUnion) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

// AudioTranscriptionNewResponseUnionUsage is an implicit subunion of
// [AudioTranscriptionNewResponseUnion]. AudioTranscriptionNewResponseUnionUsage
// provides convenient access to the sub-properties of the union.
//
// For type safety it is recommended to directly use a variant of the
// [AudioTranscriptionNewResponseUnion].
type AudioTranscriptionNewResponseUnionUsage struct {
	// This field is from variant [TranscriptionUsageUnion].
	InputTokens int64 `json:"input_tokens"`
	// This field is from variant [TranscriptionUsageUnion].
	OutputTokens int64 `json:"output_tokens"`
	// This field is from variant [TranscriptionUsageUnion].
	TotalTokens int64  `json:"total_tokens"`
	Type        string `json:"type"`
	// This field is from variant [TranscriptionUsageUnion].
	InputTokenDetails TranscriptionUsageTokensInputTokenDetails `json:"input_token_details"`
	Seconds           float64                                   `json:"seconds"`
	JSON              struct {
		InputTokens       respjson.Field
		OutputTokens      respjson.Field
		TotalTokens       respjson.Field
		Type              respjson.Field
		InputTokenDetails respjson.Field
		Seconds           respjson.Field
		raw               string
	} `json:"-"`
}

func (r *AudioTranscriptionNewResponseUnionUsage) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

type AudioTranscriptionNewParams struct {
	// The audio file object (not file name) to transcribe, in one of these formats:
	// flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
	File io.Reader `json:"file,omitzero" api:"required" format:"binary"`
	// ID of the model to use. The options are `gpt-4o-transcribe`,
	// `gpt-4o-mini-transcribe`, `gpt-4o-mini-transcribe-2025-12-15`, `whisper-1`
	// (which is powered by our open source Whisper V2 model), and
	// `gpt-4o-transcribe-diarize`.
	Model AudioModel `json:"model,omitzero" api:"required"`
	// The language of the input audio. Supplying the input language in
	// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
	// format will improve accuracy and latency.
	Language param.Opt[string] `json:"language,omitzero"`
	// An optional text to guide the model's style or continue a previous audio
	// segment. The
	// [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
	// should match the audio language. This field is not supported when using
	// `gpt-4o-transcribe-diarize`.
	Prompt param.Opt[string] `json:"prompt,omitzero"`
	// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
	// output more random, while lower values like 0.2 will make it more focused and
	// deterministic. If set to 0, the model will use
	// [log probability](https://en.wikipedia.org/wiki/Log_probability) to
	// automatically increase the temperature until certain thresholds are hit.
	Temperature param.Opt[float64] `json:"temperature,omitzero"`
	// Controls how the audio is cut into chunks. When set to `"auto"`, the server
	// first normalizes loudness and then uses voice activity detection (VAD) to choose
	// boundaries. `server_vad` object can be provided to tweak VAD detection
	// parameters manually. If unset, the audio is transcribed as a single block.
	// Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
	// seconds.
	ChunkingStrategy AudioTranscriptionNewParamsChunkingStrategyUnion `json:"chunking_strategy,omitzero"`
	// Additional information to include in the transcription response. `logprobs` will
	// return the log probabilities of the tokens in the response to understand the
	// model's confidence in the transcription. `logprobs` only works with
	// response_format set to `json` and only with the models `gpt-4o-transcribe`,
	// `gpt-4o-mini-transcribe`, and `gpt-4o-mini-transcribe-2025-12-15`. This field is
	// not supported when using `gpt-4o-transcribe-diarize`.
	Include []TranscriptionInclude `json:"include,omitzero"`
	// Optional list of speaker names that correspond to the audio samples provided in
	// `known_speaker_references[]`. Each entry should be a short identifier (for
	// example `customer` or `agent`). Up to 4 speakers are supported.
	KnownSpeakerNames []string `json:"known_speaker_names,omitzero"`
	// Optional list of audio samples (as
	// [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
	// that contain known speaker references matching `known_speaker_names[]`. Each
	// sample must be between 2 and 10 seconds, and can use any of the same input audio
	// formats supported by `file`.
	KnownSpeakerReferences []string `json:"known_speaker_references,omitzero"`
	// The format of the output, in one of these options: `json`, `text`, `srt`,
	// `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
	// `gpt-4o-mini-transcribe`, the only supported format is `json`. For
	// `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
	// `diarized_json`, with `diarized_json` required to receive speaker annotations.
	//
	// Any of "json", "text", "srt", "verbose_json", "vtt", "diarized_json".
	ResponseFormat AudioResponseFormat `json:"response_format,omitzero"`
	// The timestamp granularities to populate for this transcription.
	// `response_format` must be set `verbose_json` to use timestamp granularities.
	// Either or both of these options are supported: `word`, or `segment`. Note: There
	// is no additional latency for segment timestamps, but generating word timestamps
	// incurs additional latency. This option is not available for
	// `gpt-4o-transcribe-diarize`.
	//
	// Any of "word", "segment".
	TimestampGranularities []string `json:"timestamp_granularities,omitzero"`
	paramObj
}

func (r AudioTranscriptionNewParams) MarshalMultipart() (data []byte, contentType string, err error) {
	buf := bytes.NewBuffer(nil)
	writer := multipart.NewWriter(buf)
	err = apiform.MarshalRoot(r, writer)
	if err == nil {
		err = apiform.WriteExtras(writer, r.ExtraFields())
	}
	if err != nil {
		writer.Close()
		return nil, "", err
	}
	err = writer.Close()
	if err != nil {
		return nil, "", err
	}
	return buf.Bytes(), writer.FormDataContentType(), nil
}

// Only one field can be non-zero.
//
// Use [param.IsOmitted] to confirm if a field is set.
type AudioTranscriptionNewParamsChunkingStrategyUnion struct {
	// Construct this variant with constant.ValueOf[constant.Auto]()
	OfAuto                                            constant.Auto                                         `json:",omitzero,inline"`
	OfAudioTranscriptionNewsChunkingStrategyVadConfig *AudioTranscriptionNewParamsChunkingStrategyVadConfig `json:",omitzero,inline"`
	paramUnion
}

func (u AudioTranscriptionNewParamsChunkingStrategyUnion) MarshalJSON() ([]byte, error) {
	return param.MarshalUnion(u, u.OfAuto, u.OfAudioTranscriptionNewsChunkingStrategyVadConfig)
}
func (u *AudioTranscriptionNewParamsChunkingStrategyUnion) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, u)
}

func (u *AudioTranscriptionNewParamsChunkingStrategyUnion) asAny() any {
	if !param.IsOmitted(u.OfAuto) {
		return &u.OfAuto
	} else if !param.IsOmitted(u.OfAudioTranscriptionNewsChunkingStrategyVadConfig) {
		return u.OfAudioTranscriptionNewsChunkingStrategyVadConfig
	}
	return nil
}

// The property Type is required.
type AudioTranscriptionNewParamsChunkingStrategyVadConfig struct {
	// Must be set to `server_vad` to enable manual chunking using server side VAD.
	//
	// Any of "server_vad".
	Type string `json:"type,omitzero" api:"required"`
	// Amount of audio to include before the VAD detected speech (in milliseconds).
	PrefixPaddingMs param.Opt[int64] `json:"prefix_padding_ms,omitzero"`
	// Duration of silence to detect speech stop (in milliseconds). With shorter values
	// the model will respond more quickly, but may jump in on short pauses from the
	// user.
	SilenceDurationMs param.Opt[int64] `json:"silence_duration_ms,omitzero"`
	// Sensitivity threshold (0.0 to 1.0) for voice activity detection. A higher
	// threshold will require louder audio to activate the model, and thus might
	// perform better in noisy environments.
	Threshold param.Opt[float64] `json:"threshold,omitzero"`
	paramObj
}

func (r AudioTranscriptionNewParamsChunkingStrategyVadConfig) MarshalJSON() (data []byte, err error) {
	type shadow AudioTranscriptionNewParamsChunkingStrategyVadConfig
	return param.MarshalObject(r, (*shadow)(&r))
}
func (r *AudioTranscriptionNewParamsChunkingStrategyVadConfig) UnmarshalJSON(data []byte) error {
	return apijson.UnmarshalRoot(data, r)
}

func init() {
	apijson.RegisterFieldValidator[AudioTranscriptionNewParamsChunkingStrategyVadConfig](
		"type", "server_vad",
	)
}