VLM support for image and video processing with SmolVLM support #206

Open · wants to merge 37 commits into base: main

Commits (37)
b7c61ac Update MediaProcessing.swift (cyrilzakka, Feb 12, 2025)
cc31f91 Merge pull request #1 from ml-explore/main (cyrilzakka, Feb 13, 2025)
1ba603c smolvlm processing (pcuenca, Feb 13, 2025)
610a457 Fix arch name (pcuenca, Feb 13, 2025)
113f93d Optional config values (pcuenca, Feb 13, 2025)
dc6b71f Perform image tiling (pcuenca, Feb 14, 2025)
1cf5906 Inference runs, but generations are random. (pcuenca, Feb 14, 2025)
da3f80f Restore Idefics3 processor, add SmolVLMProcessor (pcuenca, Feb 14, 2025)
ad6f05c clean (pcuenca, Feb 14, 2025)
7be743d Reorder to unhardcode rows, cols (pcuenca, Feb 14, 2025)
6b63fcf Remove unused var (pcuenca, Feb 14, 2025)
521b927 Fix typo lol (pcuenca, Feb 14, 2025)
0eaab62 Initial support for image and video processing (cyrilzakka, Feb 14, 2025)
76707dd Added global token to video prompt (cyrilzakka, Feb 15, 2025)
5f39269 Merge remote-tracking branch 'cyril/main' into smolvlm-processing (pcuenca, Feb 15, 2025)
ee2cd3b Fix prompt handling in video (pcuenca, Feb 15, 2025)
cb22d0d Merge pull request #2 from pcuenca/smolvlm-processing (cyrilzakka, Feb 15, 2025)
a94f419 Small cleanup (pcuenca, Feb 16, 2025)
0fe3a46 Add preprocessor configuration (pcuenca, Feb 16, 2025)
a831a12 Unhardcode some values from config (pcuenca, Feb 16, 2025)
7a6d2c6 Remove prints (pcuenca, Feb 16, 2025)
c73bfe3 Chaining API -> some vars are now lets. (pcuenca, Feb 16, 2025)
2807b88 Merge pull request #3 from pcuenca/smolvlm-changes (cyrilzakka, Feb 16, 2025)
d407259 Change llm-tool to follow the smolvlm format (pcuenca, Feb 18, 2025)
b86cdf2 Fix system prompt, use prompts by Miquel (pcuenca, Feb 18, 2025)
97ed22b Add system prompt (pcuenca, Feb 18, 2025)
08b1e8c Update Applications/VLMEval/ContentView.swift (pcuenca, Feb 18, 2025)
6f5e2f4 Merge pull request #4 from pcuenca/smolvlm-changes (cyrilzakka, Feb 18, 2025)
9d7ad6e Fix single image pre-processing (pcuenca, Feb 18, 2025)
ac482a3 Merge pull request #5 from pcuenca/image-preprocessing (cyrilzakka, Feb 18, 2025)
481756e Multiply fps if duration < 10 (pcuenca, Feb 18, 2025)
b232921 Merge pull request #6 from pcuenca/adjust-fps (cyrilzakka, Feb 19, 2025)
00394f1 swift-format (pcuenca, Feb 22, 2025)
7d1934d Temporarily reduce max frames to 20 (pcuenca, Feb 22, 2025)
61a95b9 Merge pull request #7 from pcuenca/style (cyrilzakka, Feb 22, 2025)
45fcaf8 un-set development team (davidkoski, Mar 4, 2025)
261fd98 Merge branch 'main' into main (davidkoski, Mar 4, 2025)
66 changes: 41 additions & 25 deletions Applications/VLMEval/ContentView.swift
@@ -15,6 +15,11 @@ import SwiftUI
typealias PlatformImage = NSImage
#endif

let videoSystemPrompt =
"Focus only on describing the key dramatic action or notable event occurring in this video segment. Skip general context or scene-setting details unless they are crucial to understanding the main action."
let imageSystemPrompt =
"You are an image understanding model capable of describing the salient features of any image."
Collaborator:

I think these are fine for now, but this + the message formatting needs to be figured out (later :-) )

Contributor:

Agreed!


struct ContentView: View {
@State var prompt = ""
@State var llm = VLMEvaluator()
@@ -28,7 +33,7 @@ struct ContentView: View {
}
}
}
- @State private var selectedVideoURL: URL? = nil {
+ @State private var selectedVideoURL: URL? {
didSet {
if let selectedVideoURL {
player = AVPlayer(url: selectedVideoURL)
@@ -61,7 +66,11 @@ struct ContentView: View {
}

VStack {
- if let selectedImage {
+ if let player {
+ VideoPlayer(player: player)
+ .frame(height: 300)
+ .cornerRadius(12)
+ } else if let selectedImage {
Group {
#if os(iOS) || os(visionOS)
Image(uiImage: selectedImage)
@@ -91,11 +100,6 @@
EmptyView()
}
}
- } else if let player {
- VideoPlayer(player: player)
- .scaledToFit()
- .frame(maxHeight: 300)
- .cornerRadius(12)
}

HStack {
@@ -193,6 +197,7 @@ struct ContentView: View {
.id("bottom")
}
}
.frame(minHeight: 200)

HStack {
TextField("prompt", text: $prompt)
@@ -205,6 +210,9 @@
.disabled(llm.running)
}
}
.onAppear {
selectedVideoURL = Bundle.main.url(forResource: "test", withExtension: "mp4")!
}
Collaborator:

This is nice for testing, but I think we should probably remove the example asset and force people to use their own images & videos. Also, I don't know the license on this video :-)

On the other hand this is meant as an example for developers to build on and maybe it is good to have something ready to go? Anyone have any thoughts on this?

Contributor:

I'm fine either way. cc @cyrilzakka on the video rights (but also for opinion) :)

#if os(visionOS)
.padding(40)
#else
@@ -322,10 +330,10 @@ class VLMEvaluator {

/// This controls which model loads. `qwen2VL2BInstruct4Bit` is one of the smaller ones, so this will fit on
/// more devices.
- let modelConfiguration = ModelRegistry.qwen2VL2BInstruct4Bit
+ let modelConfiguration = ModelRegistry.smolvlm
Collaborator:

We should revert this before merging or if we think this is a better default model, update the comment.

Contributor:

Sure, will revert, this was meant for our own testing.


/// parameters controlling the output
- let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.6)
+ let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.7, topP: 0.9)
Contributor:

These parameters are also smolvlm-specific.
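
For instance, something along these lines could keep per-model defaults together instead of hard-coding one set in the evaluator. A sketch only; the helper name is made up, and the values just mirror the ones in this diff:

```swift
import Foundation
import MLXLMCommon

// Hypothetical helper (not part of this PR): choose sampling defaults per
// model family instead of hard-coding a single set in VLMEvaluator.
func defaultGenerateParameters(for modelID: String) -> GenerateParameters {
    if modelID.lowercased().contains("smolvlm") {
        // Values used for SmolVLM in this PR
        return GenerateParameters(temperature: 0.7, topP: 0.9)
    }
    // Previous default used by the example app
    return GenerateParameters(temperature: 0.6)
}
```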

let maxTokens = 800

/// update the display every N tokens -- 4 looks like it updates continuously
@@ -395,23 +403,31 @@ class VLMEvaluator {
} else {
[]
}
- var userInput = UserInput(
- messages: [
- [
- "role": "user",
- "content": [
- ["type": "text", "text": prompt]
- ]
- + images.map { _ in
- ["type": "image"]
- }
- + videos.map { _ in
- ["type": "video"]
- },
- ]
- ], images: images, videos: videos)
- userInput.processing.resize = .init(width: 448, height: 448)

+ // Note: the image order is different for smolvlm
+ let messages: [Message] = [
+ [
+ "role": "system",
+ "content": [
+ [
+ "type": "text",
+ "text": videoURL != nil ? videoSystemPrompt : imageSystemPrompt,
+ ]
+ ],
+ ],
+ [
+ "role": "user",
+ "content": []
+ + images.map { _ in
+ ["type": "image"]
+ }
+ + videos.map { _ in
+ ["type": "video"]
+ }
+ + [["type": "text", "text": prompt]],
+ ],
+ ]
+ let userInput = UserInput(messages: messages, images: images, videos: videos)
let input = try await context.processor.prepare(input: userInput)

return try MLXLMCommon.generate(
Binary file added Applications/VLMEval/test.mp4
Binary file not shown.
145 changes: 143 additions & 2 deletions Libraries/MLXVLM/MediaProcessing.swift
@@ -5,6 +5,13 @@ import CoreImage.CIFilterBuiltins
import MLX
import MLXLMCommon

public struct VideoFrameResult {
let frames: [CIImage]
let timestamps: [String]
let totalDuration: String
}

// TODO: verify working color space, rendering color space
Collaborator:

This is a good idea. I think the Python processing code effectively keeps the colorspace of the input: no conversion to linear, and what is called "device RGB" (don't touch my colors). In other words it isn't color managed, but that is what we have here.

We could certainly do something like use the non-linear form of the input colorspace and output to the same. In practice I am not sure it matters that much. These models are probably trained on consistent colorspace inputs (though sRGB is likely, displayP3 from iPhone images is pretty likely, and videos are much more diverse).

Maybe this should turn into an issue?

Collaborator:

That said: I don't think we should try to replicate the unmanaged colorspace of the python version. I think we should pick a colorspace (sRGB or displayP3) and be consistent.

Contributor:

Yes, makes sense to turn into an issue. I also think that Python pre-processing is mostly oblivious to colorspace.
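
If we do standardize, a minimal sketch of the "pick one colorspace and be consistent" option, assuming we render through a `CIContext` the way this file already does (the helper name is made up):

```swift
import CoreImage

// Hypothetical sketch: render any input into a fixed working colorspace
// (sRGB here) before preprocessing, so Display P3 iPhone photos and
// assorted video frames all enter the pipeline the same way.
func renderedInSRGB(_ image: CIImage, context: CIContext) -> CIImage? {
    guard let srgb = CGColorSpace(name: CGColorSpace.sRGB),
        let cgImage = context.createCGImage(
            image, from: image.extent, format: .RGBA8, colorSpace: srgb)
    else { return nil }
    return CIImage(cgImage: cgImage)
}
```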

private let context = CIContext()

/// Collection of methods for processing media (images, video, etc.).
@@ -87,6 +94,35 @@ public enum MediaProcessing {
return rescaled.cropped(to: CGRect(origin: .zero, size: size))
}

/// Resample the image using Lanczos interpolation.
static public func resampleLanczos(_ image: CIImage, to size: CGSize) -> CIImage {
Collaborator:

Smol uses Lanczos? I agree it is the better resampling method for humans, but the sinc it simulates has an edge strengthening effect -- I am surprised to see it used here.

Contributor:

Yes, it does, I was surprised too when I saw it but didn't follow up with the team. cc @mfarre, just curious if there's any insight :)

Reply:

this is inherited from the Idefics3 image processor :)

let filter = CIFilter.lanczosScaleTransform()
let extent = image.extent.size

filter.inputImage = image

// set the aspect ratio to match the aspect ratio of the target
let inputAspectRatio = extent.width / extent.height
let desiredAspectRatio = size.width / size.height
filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
Collaborator:

I wonder if this size/aspect ratio code should be refactored to be shared between the resampling methods?
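
For example, both methods could call a shared helper along these lines (a sketch; `aspectFillScale` is a made-up name):

```swift
import CoreGraphics

// Hypothetical shared helper: compute the aspect-ratio correction and the
// scale factor that fills `size` from `extent`, so resampleBicubic and
// resampleLanczos can share the geometry code.
func aspectFillScale(from extent: CGSize, to size: CGSize) -> (aspectRatio: Float, scale: Float) {
    let inputAspectRatio = extent.width / extent.height
    let desiredAspectRatio = size.width / size.height
    let aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
    let scale =
        extent.width < extent.height
        ? Float(size.width / extent.width)
        : Float(size.height / extent.height)
    return (aspectRatio, scale)
}
```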


// that image is now the aspect ratio of the target and the size
// of the shorter dimension
let scale: CGFloat
if extent.width < extent.height {
scale = size.width / extent.width
} else {
scale = size.height / extent.height
}
filter.scale = Float(scale)

let rescaled = filter.outputImage!

// the image has a DoD larger than the requested size so crop
// it to the desired size
return rescaled.cropped(to: CGRect(origin: .zero, size: size))
}

/// Normalize the image using the given mean and standard deviation parameters.
static public func normalize(
_ image: CIImage, mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat)
@@ -145,7 +181,7 @@ public enum MediaProcessing {
}

/// Apply `UserInput.Processing`, if needed, to the image.
- static func apply(_ image: CIImage, processing: UserInput.Processing?) -> CIImage {
+ static public func apply(_ image: CIImage, processing: UserInput.Processing?) -> CIImage {
var image = image

if let resize = processing?.resize {
@@ -156,7 +192,8 @@
return image
}

- static func asCIImageSequence(_ asset: AVAsset, samplesPerSecond: Int) async throws -> [CIImage]
+ static public func asCIImageSequence(_ asset: AVAsset, samplesPerSecond: Int) async throws
+ -> [CIImage]
{
// Use AVAssetImageGenerator to extract frames
let generator = AVAssetImageGenerator(asset: asset)
@@ -199,4 +236,108 @@

return ciImages
}

static public func asCIImageSequence(
Collaborator:

This is pretty similar to the method above it. I wonder if we should have a VideoParameters struct that holds these values, with a single method that takes it. That method would be concerned with timing and any properties needed to read out the video (e.g. if we wanted to convert pixelFormats).

See also #223 -- maybe we can factor this part out as some of the other PRs would make use of it.
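
A sketch of what that struct might look like (names illustrative, not part of this PR):

```swift
import CoreMedia

// Hypothetical VideoParameters: collect the knobs both frame-extraction
// methods need, so a single method could serve both call sites.
struct VideoParameters {
    /// Fixed-rate sampling, as used by the existing samplesPerSecond method.
    var samplesPerSecond: Int? = nil
    /// Frame cap and target rate, as used by the SmolVLM path.
    var maxFrames: Int? = nil
    var targetFPS: Double = 1.0
    /// Leading portion of the video to skip.
    var skipSeconds: CMTime = .zero
    // Room for reader properties later, e.g. a preferred pixel format.
}
```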

_ asset: AVAsset, maxFrames: Int, targetFPS: Double, skipSeconds: CMTime = .zero
) async throws -> VideoFrameResult {
// Use AVAssetImageGenerator to extract frames
let generator = AVAssetImageGenerator(asset: asset)
generator.appliesPreferredTrackTransform = true
generator.requestedTimeToleranceBefore = .zero
generator.requestedTimeToleranceAfter = .zero

guard let duration = try? await asset.load(.duration) else {
throw NSError(
domain: "MediaProcessing", code: -1,
userInfo: [NSLocalizedDescriptionKey: "Failed to load the asset's duration"])
}
// 1 fps for duration >= 10s, apply a multiplier if smaller
let adjustedFPS = max((10 - 0.9 * duration.seconds) * targetFPS, 1)
let estimatedFrames = Int(round(adjustedFPS * duration.seconds))
let desiredFrames = min(estimatedFrames, maxFrames)
let finalFrameCount = max(desiredFrames, 1)
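// For example, with targetFPS = 1 a 5-second clip gives
// adjustedFPS = (10 - 0.9 * 5) * 1 = 5.5, i.e. about 28 requested frames
// before the maxFrames cap; clips longer than ~11 seconds drop to the 1 fps floor.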

let durationTimeValue = duration.value
let timescale = duration.timescale
let startTimeValue =
skipSeconds.seconds > 0 ? Int64(skipSeconds.seconds * Double(timescale)) : 0
let endTimeValue =
skipSeconds.seconds > 0
? Int64(duration.seconds * Double(timescale) - skipSeconds.seconds * Double(timescale))
: duration.value
let sampledTimeValues = MLXArray.linspace(
startTimeValue, endTimeValue, count: Int(finalFrameCount)
).asArray(Int64.self)

let sampledTimes = sampledTimeValues.map { CMTime(value: $0, timescale: timescale) }

// Collect the frames
var ciImages: [CIImage] = []
var timestamps: [String] = []

for await result in await generator.images(for: sampledTimes) {
switch result {
case .success(requestedTime: let requested, let image, actualTime: let actual):
let ciImage = CIImage(
cgImage: image, options: [.colorSpace: CGColorSpace(name: CGColorSpace.sRGB)!])
ciImages.append(ciImage)
timestamps.append(formatTimestamp(actual))
case .failure(requestedTime: let requested, let error):
break
}
}

let totalDuration = formatTimestamp(duration)

return VideoFrameResult(
frames: ciImages,
timestamps: timestamps,
totalDuration: totalDuration
)
}

private static func formatTimestamp(_ time: CMTime) -> String {
let totalSeconds = Int(ceil(time.seconds))
let hours = totalSeconds / 3600
let minutes = (totalSeconds % 3600) / 60
let seconds = totalSeconds % 60

return String(format: "%d:%02d:%02d", hours, minutes, seconds)
}
}

// MARK: - Convenience

extension CIImage {
public enum ResamplingMethod {
case bicubic
case lanczos
}

public func resampled(to size: CGSize, method: ResamplingMethod = .bicubic) -> CIImage {
switch method {
case .bicubic:
return MediaProcessing.resampleBicubic(self, to: size)
case .lanczos:
return MediaProcessing.resampleLanczos(self, to: size)
}
}

public func toSRGB() -> CIImage {
return MediaProcessing.inSRGBToneCurveSpace(self)
}

public func toLinear() -> CIImage {
return MediaProcessing.inLinearToneCurveSpace(self)
}

public func normalized(mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat))
-> CIImage
{
return MediaProcessing.normalize(self, mean: mean, std: std)
}

public func asMLXArray(colorSpace: CGColorSpace? = nil) -> MLXArray {
return MediaProcessing.asMLXArray(self, colorSpace: colorSpace)
}
}
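
For illustration, the chaining style this extension enables (the size and mean/std here are placeholders, not SmolVLM's actual preprocessing constants):

```swift
import CoreImage
import MLX
import MLXVLM

// Illustrative use of the convenience extension above; the constants are
// placeholders, not the model's real normalization values.
func preprocess(_ image: CIImage) -> MLXArray {
    image
        .toSRGB()
        .resampled(to: CGSize(width: 384, height: 384), method: .lanczos)
        .normalized(mean: (0.5, 0.5, 0.5), std: (0.5, 0.5, 0.5))
        .asMLXArray()
}
```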