From 5c40cf33d0e08ad7be939a8a106351f085bfa18d Mon Sep 17 00:00:00 2001
From: Fedor Zhdanov
Date: Tue, 23 Apr 2024 22:45:40 -0700
Subject: [PATCH] added support for verbose_json for audio transcriptions
 (words and timestamps)

---
 .../Models/AudioTranscriptionQuery.swift  | 31 ++++++---
 .../Models/AudioTranscriptionResult.swift | 63 ++++++++++++++++++-
 2 files changed, 83 insertions(+), 11 deletions(-)

diff --git a/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift b/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
index d1449d4a..56bbfbf2 100644
--- a/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
+++ b/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
@@ -8,14 +8,19 @@ import Foundation
 
 public struct AudioTranscriptionQuery: Codable {
+
+    public enum TimestampGranularities: String, Codable, Equatable, CaseIterable {
+        case word
+        case segment
+    }
 
-public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
-    case json
-    case text
-    case verboseJson = "verbose_json"
-    case srt
-    case vtt
-}
+    public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
+        case json
+        case text
+        case verboseJson = "verbose_json"
+        case srt
+        case vtt
+    }
 
     /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
     public let file: Data
 
@@ -33,8 +38,11 @@ public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
     /// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
     /// https://platform.openai.com/docs/guides/speech-to-text/prompting
     public let language: String?
+    /// The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word and segment. Note: there is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+    /// Defaults to segment.
+    public let timestampGranularities: [Self.TimestampGranularities]
 
-    public init(file: Data, fileType: Self.FileType, model: Model, prompt: String? = nil, temperature: Double? = nil, language: String? = nil, responseFormat: Self.ResponseFormat? = nil) {
+    public init(file: Data, fileType: Self.FileType, model: Model, prompt: String? = nil, temperature: Double? = nil, language: String? = nil, responseFormat: Self.ResponseFormat? = nil, timestampGranularities: [Self.TimestampGranularities] = []) {
         self.file = file
         self.fileType = fileType
         self.model = model
@@ -42,6 +50,7 @@ public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
         self.temperature = temperature
         self.language = language
         self.responseFormat = responseFormat
+        self.timestampGranularities = timestampGranularities
     }
 
     public enum FileType: String, Codable, Equatable, CaseIterable {
@@ -88,8 +97,10 @@ extension AudioTranscriptionQuery: MultipartFormDataBodyEncodable {
             .string(paramName: "prompt", value: prompt),
             .string(paramName: "temperature", value: temperature),
             .string(paramName: "language", value: language),
-            .string(paramName: "response_format", value: responseFormat)
-        ])
+            .string(paramName: "response_format", value: responseFormat?.rawValue),
+        ] + timestampGranularities.map({.string(paramName: "timestamp_granularities[]", value: $0)})
+        )
+
         return bodyBuilder.build()
     }
 }
diff --git a/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift b/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
index b1c96f56..f6ef8f47 100644
--- a/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
+++ b/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
@@ -8,7 +8,68 @@ import Foundation
 
 public struct AudioTranscriptionResult: Codable, Equatable {
-    
+
+    public struct Word: Codable, Equatable {
+        /// The text content of the word.
+        public let word: String
+        /// Start time of the word in seconds.
+        public let start: Float
+        /// End time of the word in seconds.
+        public let end: Float
+    }
+
+    public struct Segment: Codable, Equatable {
+        /// Unique identifier of the segment.
+        public let id: Int
+        /// Seek offset of the segment.
+        public let seek: Int
+        /// Start time of the segment in seconds.
+        public let start: Float
+        /// End time of the segment in seconds.
+        public let end: Float
+        /// Text content of the segment.
+        public let text: String
+        /// Array of token IDs for the text content.
+        public let tokens: [Int]
+        /// Temperature parameter used for generating the segment.
+        public let temperature: Float
+        /// Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
+        public let avgLogprob: Float
+        /// Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
+        public let compressionRatio: Float
+        /// Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
+        public let noSpeechProb: Float
+
+        enum CodingKeys: String, CodingKey {
+            case id
+            case seek
+            case start
+            case end
+            case text
+            case tokens
+            case temperature
+            case avgLogprob = "avg_logprob"
+            case compressionRatio = "compression_ratio"
+            case noSpeechProb = "no_speech_prob"
+        }
+    }
+
     /// The transcribed text.
     public let text: String
+
+    /// The task performed (for transcriptions this is "transcribe").
+    public let task: String?
+
+    /// The language of the input audio.
+    public let language: String?
+
+    /// The duration of the input audio, in seconds.
+    public let duration: Float?
+
+    /// Extracted words and their corresponding timestamps.
+    public let words: [Word]?
+
+    /// Segments of the transcribed text and their corresponding details.
+    public let segments: [Segment]?
+
 }
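
Note (not part of the patch): a minimal usage sketch of the new options. It assumes the library's async audioTranscriptions(query:) entry point, an existing .m4a FileType case, and the .whisper_1 model constant; anything not shown in the diff above is illustrative.

import Foundation
import OpenAI

// Sketch only: request word and segment timestamps for a transcription.
func transcribeWithTimestamps(client: OpenAI, audioData: Data) async throws {
    let query = AudioTranscriptionQuery(
        file: audioData,
        fileType: .m4a,
        model: .whisper_1,
        // Timestamps are only returned for the verbose_json response format.
        responseFormat: .verboseJson,
        timestampGranularities: [.word, .segment]
    )

    let result = try await client.audioTranscriptions(query: query)
    print(result.text)

    // words and segments are optional; they are present only in verbose_json responses.
    for word in result.words ?? [] {
        print("\(word.word): \(word.start)s - \(word.end)s")
    }
    for segment in result.segments ?? [] {
        print("[\(segment.start)s - \(segment.end)s] \(segment.text)")
    }
}

Leaving timestampGranularities at its empty default means no timestamp_granularities[] entries are sent, so the API's own default (segment) applies when verbose_json is requested.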