From 5c40cf33d0e08ad7be939a8a106351f085bfa18d Mon Sep 17 00:00:00 2001
From: Fedor Zhdanov
Date: Tue, 23 Apr 2024 22:45:40 -0700
Subject: [PATCH] added support for verbose_json for audio transcriptions
 (words and timestamps)

---
 .../Models/AudioTranscriptionQuery.swift  | 31 ++++++---
 .../Models/AudioTranscriptionResult.swift | 63 ++++++++++++++++++-
 2 files changed, 83 insertions(+), 11 deletions(-)

diff --git a/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift b/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
index d1449d4a..56bbfbf2 100644
--- a/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
+++ b/Sources/OpenAI/Public/Models/AudioTranscriptionQuery.swift
@@ -8,14 +8,19 @@ import Foundation
 
 public struct AudioTranscriptionQuery: Codable {
+
+    public enum TimestampGranularities: String, Codable, Equatable, CaseIterable {
+        case word
+        case segment
+    }
 
-public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
-    case json
-    case text
-    case verboseJson = "verbose_json"
-    case srt
-    case vtt
-}
+    public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
+        case json
+        case text
+        case verboseJson = "verbose_json"
+        case srt
+        case vtt
+    }
 
     /// The audio file object (not file name) to transcribe, in one of these formats: flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
     public let file: Data
 
@@ -33,8 +38,11 @@ public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
     /// The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.
     /// https://platform.openai.com/docs/guides/speech-to-text/prompting
     public let language: String?
+    /// The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Either or both of these options are supported: word and segment. Note: there is no additional latency for segment timestamps, but generating word timestamps incurs additional latency.
+    /// Defaults to segment.
+    public let timestampGranularities: [Self.TimestampGranularities]
 
-    public init(file: Data, fileType: Self.FileType, model: Model, prompt: String? = nil, temperature: Double? = nil, language: String? = nil, responseFormat: Self.ResponseFormat? = nil) {
+    public init(file: Data, fileType: Self.FileType, model: Model, prompt: String? = nil, temperature: Double? = nil, language: String? = nil, responseFormat: Self.ResponseFormat? = nil, timestampGranularities: [Self.TimestampGranularities] = []) {
         self.file = file
         self.fileType = fileType
         self.model = model
@@ -42,6 +50,7 @@ public enum ResponseFormat: String, Codable, Equatable, CaseIterable {
         self.temperature = temperature
         self.language = language
         self.responseFormat = responseFormat
+        self.timestampGranularities = timestampGranularities
     }
 
     public enum FileType: String, Codable, Equatable, CaseIterable {
@@ -88,8 +97,10 @@ extension AudioTranscriptionQuery: MultipartFormDataBodyEncodable {
             .string(paramName: "prompt", value: prompt),
             .string(paramName: "temperature", value: temperature),
             .string(paramName: "language", value: language),
-            .string(paramName: "response_format", value: responseFormat)
-        ])
+            .string(paramName: "response_format", value: responseFormat?.rawValue),
+        ] + timestampGranularities.map({.string(paramName: "timestamp_granularities[]", value: $0)})
+        )
+
         return bodyBuilder.build()
     }
 }
diff --git a/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift b/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
index b1c96f56..f6ef8f47 100644
--- a/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
+++ b/Sources/OpenAI/Public/Models/AudioTranscriptionResult.swift
@@ -8,7 +8,68 @@ import Foundation
 
 public struct AudioTranscriptionResult: Codable, Equatable {
-    
+
+    public struct Word: Codable, Equatable {
+        /// The text content of the word.
+        public let word: String
+        /// Start time of the word in seconds.
+        public let start: Float
+        /// End time of the word in seconds.
+        public let end: Float
+    }
+
+    public struct Segment: Codable, Equatable {
+        /// Unique identifier of the segment.
+        public let id: Int
+        /// Seek offset of the segment.
+        public let seek: Int
+        /// Start time of the segment in seconds.
+        public let start: Float
+        /// End time of the segment in seconds.
+        public let end: Float
+        /// Text content of the segment.
+        public let text: String
+        /// Array of token IDs for the text content.
+        public let tokens: [Int]
+        /// Temperature parameter used for generating the segment.
+        public let temperature: Float
+        /// Average logprob of the segment. If the value is lower than -1, consider the logprobs failed.
+        public let avgLogprob: Float
+        /// Compression ratio of the segment. If the value is greater than 2.4, consider the compression failed.
+        public let compressionRatio: Float
+        /// Probability of no speech in the segment. If the value is higher than 1.0 and the avg_logprob is below -1, consider this segment silent.
+        public let noSpeechProb: Float
+
+        enum CodingKeys: String, CodingKey {
+            case id
+            case seek
+            case start
+            case end
+            case text
+            case tokens
+            case temperature
+            case avgLogprob = "avg_logprob"
+            case compressionRatio = "compression_ratio"
+            case noSpeechProb = "no_speech_prob"
+        }
+    }
+
     /// The transcribed text.
     public let text: String
+
+    /// The task performed (for transcriptions this is "transcribe").
+    public let task: String?
+
+    /// The language of the input audio.
+    public let language: String?
+
+    /// The duration of the input audio, in seconds.
+    public let duration: Float?
+
+    /// Extracted words and their corresponding timestamps.
+    public let words: [Word]?
+
+    /// Segments of the transcribed text and their corresponding details.
+    public let segments: [Segment]?
+
 }
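
Note (not part of the patch): a minimal usage sketch of the new options. It assumes the library's async audioTranscriptions(query:) entry point, an existing .m4a FileType case, and the .whisper_1 model constant; anything not shown in the diff above is illustrative.

import Foundation
import OpenAI

// Sketch only: request word and segment timestamps for a transcription.
func transcribeWithTimestamps(client: OpenAI, audioData: Data) async throws {
    let query = AudioTranscriptionQuery(
        file: audioData,
        fileType: .m4a,
        model: .whisper_1,
        // Timestamps are only returned for the verbose_json response format.
        responseFormat: .verboseJson,
        timestampGranularities: [.word, .segment]
    )

    let result = try await client.audioTranscriptions(query: query)
    print(result.text)

    // words and segments are optional; they are present only in verbose_json responses.
    for word in result.words ?? [] {
        print("\(word.word): \(word.start)s - \(word.end)s")
    }
    for segment in result.segments ?? [] {
        print("[\(segment.start)s - \(segment.end)s] \(segment.text)")
    }
}

Leaving timestampGranularities at its empty default means no timestamp_granularities[] entries are sent, so the API's own default (segment) applies when verbose_json is requested.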