VLM support for image and video processing with SmolVLM support #206
base: main
Applications/VLMEval/ContentView.swift
@@ -15,6 +15,11 @@ import SwiftUI
     typealias PlatformImage = NSImage
 #endif

+let videoSystemPrompt =
+    "Focus only on describing the key dramatic action or notable event occurring in this video segment. Skip general context or scene-setting details unless they are crucial to understanding the main action."
+let imageSystemPrompt =
+    "You are an image understanding model capable of describing the salient features of any image."
+
 struct ContentView: View {
     @State var prompt = ""
     @State var llm = VLMEvaluator()

@@ -28,7 +33,7 @@ struct ContentView: View {
             }
         }
     }
-    @State private var selectedVideoURL: URL? = nil {
+    @State private var selectedVideoURL: URL? {
         didSet {
             if let selectedVideoURL {
                 player = AVPlayer(url: selectedVideoURL)
@@ -61,7 +66,11 @@ struct ContentView: View {
     }

     VStack {
-        if let selectedImage {
+        if let player {
+            VideoPlayer(player: player)
+                .frame(height: 300)
+                .cornerRadius(12)
+        } else if let selectedImage {
             Group {
                 #if os(iOS) || os(visionOS)
                     Image(uiImage: selectedImage)
@@ -91,11 +100,6 @@
                 EmptyView()
             }
         }
-    } else if let player {
-        VideoPlayer(player: player)
-            .scaledToFit()
-            .frame(maxHeight: 300)
-            .cornerRadius(12)
     }

     HStack {
@@ -193,6 +197,7 @@
                     .id("bottom")
                 }
             }
+            .frame(minHeight: 200)

             HStack {
                 TextField("prompt", text: $prompt)
@@ -205,6 +210,9 @@
                 .disabled(llm.running)
             }
         }
+        .onAppear {
+            selectedVideoURL = Bundle.main.url(forResource: "test", withExtension: "mp4")!
+        }

Review comment: This is nice for testing, but I think we should probably remove the example asset from the example -- force people to use their own images and videos. Also, I don't know the license on this video :-) On the other hand, this is meant as an example for developers to build on, and maybe it is good to have something ready to go? Anyone have any thoughts on this?

Reply: I'm fine either way. cc @cyrilzakka on the video rights (but also for opinion) :)
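If the bundled asset is dropped, one hedged alternative is a file importer so users supply their own video (a sketch, not part of this PR; `showVideoImporter` is an assumed @State flag, and `UniformTypeIdentifiers` must be imported for `.movie`):

// Hypothetical replacement for the .onAppear above: ask the user for a video
// instead of loading a bundled "test.mp4".
.fileImporter(isPresented: $showVideoImporter, allowedContentTypes: [.movie]) { result in
    if case .success(let url) = result {
        selectedVideoURL = url  // triggers the didSet that creates the AVPlayer
    }
}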
        #if os(visionOS)
            .padding(40)
        #else
@@ -322,10 +330,10 @@ class VLMEvaluator {
     /// This controls which model loads. `qwen2VL2BInstruct4Bit` is one of the smaller ones, so this will fit on
     /// more devices.
-    let modelConfiguration = ModelRegistry.qwen2VL2BInstruct4Bit
+    let modelConfiguration = ModelRegistry.smolvlm

Review comment: We should revert this before merging, or, if we think this is a better default model, update the comment.

Reply: Sure, will revert -- this was meant for our own testing.

     /// parameters controlling the output
-    let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.6)
+    let generateParameters = MLXLMCommon.GenerateParameters(temperature: 0.7, topP: 0.9)

Review comment: These parameters are also smolvlm-specific.

     let maxTokens = 800

     /// update the display every N tokens -- 4 looks like it updates continuously
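If the SmolVLM-specific sampling settings stay, one way to keep the default model and its parameters from drifting apart is to key the parameters off the configuration (a hypothetical sketch, not part of this PR; the `name` comparison is an assumption about `ModelConfiguration`):

// Hypothetical helper: keep per-model sampling defaults in one place so
// changing `modelConfiguration` does not silently mismatch the parameters.
// The SmolVLM values are the ones from this diff; the fallback is the old default.
func defaultParameters(for configuration: ModelConfiguration) -> MLXLMCommon.GenerateParameters {
    if configuration.name == ModelRegistry.smolvlm.name {
        return .init(temperature: 0.7, topP: 0.9)
    }
    return .init(temperature: 0.6)
}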
@@ -401,7 +409,10 @@
                 [
                     "role": "user",
                     "content": [
-                        ["type": "text", "text": prompt]
+                        [
+                            "type": "text",
+                            "text": videoURL != nil ? videoSystemPrompt : imageSystemPrompt,
+                        ]
                     ]
                         // Messages format for Qwen 2 VL, Qwen 2.5 VL. May need to be adapted for other models.
                         + images.map { _ in
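The hunk is truncated after `images.map`; a hedged sketch of the fully assembled payload, following the Qwen 2 VL format named in the comment (the `["type": "image"]` entries are an assumption, not shown in this diff):

// Sketch of the complete messages value this code appears to build.
let messages: [[String: Any]] = [
    [
        "role": "user",
        "content": [
            [
                "type": "text",
                "text": videoURL != nil ? videoSystemPrompt : imageSystemPrompt,
            ]
        ] + images.map { _ in ["type": "image"] },  // assumed per-image entries
    ]
]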
Libraries/MLXVLM/MediaProcessing.swift
@@ -5,6 +5,13 @@ import CoreImage.CIFilterBuiltins
 import MLX
 import MLXLMCommon

+public struct VideoFrameResult {
+    let frames: [CIImage]
+    let timestamps: [String]
+    let totalDuration: String
+}
+
 // TODO: verify working color space, rendering color space

Review comment: This is a good idea. I think the Python processing code roughly keeps the colorspace of the input: no conversion to linear, and what is called "device RGB" (don't touch my colors). In other words, it isn't managed color, but that is what we have here. We could certainly use the non-linear form of the input colorspace and output to the same. In practice I am not sure it matters that much. These models are probably trained on consistent-colorspace inputs (sRGB is likely, displayP3 from iPhone images is pretty likely, and videos are much more diverse). Maybe this should turn into an issue?

Follow-up: That said, I don't think we should try to replicate the unmanaged colorspace of the Python version. I think we should pick a colorspace (sRGB or displayP3) and be consistent.

Reply: Yes, it makes sense to turn this into an issue. I also think the Python pre-processing is mostly oblivious to colorspace.
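For what "pick a colorspace and be consistent" might look like in practice, a small illustrative sketch (not part of the PR) that renders a frame into sRGB via the module's CIContext:

// Hypothetical: render a frame into sRGB so all inputs share one colorspace.
func renderInSRGB(_ ciImage: CIImage) -> CGImage? {
    let srgb = CGColorSpace(name: CGColorSpace.sRGB)!
    return context.createCGImage(
        ciImage, from: ciImage.extent, format: .RGBA8, colorSpace: srgb)
}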
 private let context = CIContext()

 /// Collection of methods for processing media (images, video, etc.).
@@ -87,6 +94,35 @@ public enum MediaProcessing {
         return rescaled.cropped(to: CGRect(origin: .zero, size: size))
     }

+    /// Resample the image using Lanczos interpolation.
+    static public func resampleLanczos(_ image: CIImage, to size: CGSize) -> CIImage {

Review comment: Smol uses Lanczos? I agree it is the better resampling method for humans, but the sinc it simulates has an edge-strengthening effect -- I am surprised to see it used here.

Reply: Yes, it does. I was surprised too when I saw it, but didn't follow up with the team. cc @mfarre, just curious if there's any insight :)

Reply: This is inherited from the Idefics3 image processor :)
+        let filter = CIFilter.lanczosScaleTransform()
+        let extent = image.extent.size
+
+        filter.inputImage = image
+
+        // set the aspect ratio to match the aspect ratio of the target
+        let inputAspectRatio = extent.width / extent.height
+        let desiredAspectRatio = size.width / size.height
+        filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)

Review comment: I wonder if this size/aspect-ratio code should be refactored to be shared between the resampling methods?
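One possible shape for that shared helper (hypothetical, not in this PR): both the bicubic and Lanczos paths could derive their filter parameters from it.

// Hypothetical shared helper computing the aspect-ratio and scale values
// used identically by the bicubic and Lanczos resampling methods.
private static func fitParameters(extent: CGSize, target: CGSize)
    -> (aspectRatio: Float, scale: Float)
{
    let inputAspectRatio = extent.width / extent.height
    let desiredAspectRatio = target.width / target.height
    let scale =
        extent.width < extent.height
        ? target.width / extent.width
        : target.height / extent.height
    return (Float(1 / inputAspectRatio * desiredAspectRatio), Float(scale))
}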
+
+        // that image is now the aspect ratio of the target and the size
+        // of the shorter dimension
+        let scale: CGFloat
+        if extent.width < extent.height {
+            scale = size.width / extent.width
+        } else {
+            scale = size.height / extent.height
+        }
+        filter.scale = Float(scale)
+
+        let rescaled = filter.outputImage!
+
+        // the image has a DoD larger than the requested size so crop
+        // it to the desired size
+        return rescaled.cropped(to: CGRect(origin: .zero, size: size))
+    }
+
     /// Normalize the image using the given mean and standard deviation parameters.
     static public func normalize(
         _ image: CIImage, mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat)
@@ -145,7 +181,7 @@
     }

     /// Apply `UserInput.Processing`, if needed, to the image.
-    static func apply(_ image: CIImage, processing: UserInput.Processing?) -> CIImage {
+    static public func apply(_ image: CIImage, processing: UserInput.Processing?) -> CIImage {
         var image = image

         if let resize = processing?.resize {
@@ -156,7 +192,8 @@
         return image
     }

-    static func asCIImageSequence(_ asset: AVAsset, samplesPerSecond: Int) async throws -> [CIImage]
+    static public func asCIImageSequence(_ asset: AVAsset, samplesPerSecond: Int) async throws
+        -> [CIImage]
     {
         // Use AVAssetImageGenerator to extract frames
         let generator = AVAssetImageGenerator(asset: asset)
@@ -199,4 +236,108 @@
         return ciImages
     }

+    static public func asCIImageSequence(

Review comment: This is pretty similar to the method above it. I wonder if we should have a shared helper -- see also #223; maybe we can factor this part out, as some of the other PRs would make use of it.

+        _ asset: AVAsset, maxFrames: Int, targetFPS: Double, skipSeconds: CMTime = .zero
+    ) async throws -> VideoFrameResult {
+        // Use AVAssetImageGenerator to extract frames
+        let generator = AVAssetImageGenerator(asset: asset)
+        generator.appliesPreferredTrackTransform = true
+        generator.requestedTimeToleranceBefore = .zero
+        generator.requestedTimeToleranceAfter = .zero
+
+        guard let duration = try? await asset.load(.duration) else {
+            throw NSError(
+                domain: "MediaProcessing", code: -1,
+                userInfo: [NSLocalizedDescriptionKey: "Failed to load the asset's duration"])
+        }
+        // 1 fps for duration >= 10s, apply a multiplier if smaller
+        let adjustedFPS = max((10 - 0.9 * duration.seconds) * targetFPS, 1)
+        let estimatedFrames = Int(round(adjustedFPS * duration.seconds))
+        let desiredFrames = min(estimatedFrames, maxFrames)
+        let finalFrameCount = max(desiredFrames, 1)
+
+        let durationTimeValue = duration.value
+        let timescale = duration.timescale
+        let startTimeValue =
+            skipSeconds.seconds > 0 ? Int64(skipSeconds.seconds * Double(timescale)) : 0
+        let endTimeValue =
+            skipSeconds.seconds > 0
+            ? Int64(duration.seconds * Double(timescale) - skipSeconds.seconds * Double(timescale))
+            : durationTimeValue
+        let sampledTimeValues = MLXArray.linspace(
+            startTimeValue, endTimeValue, count: finalFrameCount
+        ).asArray(Int64.self)
+
+        let sampledTimes = sampledTimeValues.map { CMTime(value: $0, timescale: timescale) }
+
+        // Collect the frames
+        var ciImages: [CIImage] = []
+        var timestamps: [String] = []
+
+        for await result in await generator.images(for: sampledTimes) {
+            switch result {
+            case .success(requestedTime: _, let image, actualTime: let actual):
+                let ciImage = CIImage(
+                    cgImage: image, options: [.colorSpace: CGColorSpace(name: CGColorSpace.sRGB)!])
+                ciImages.append(ciImage)
+                timestamps.append(formatTimestamp(actual))
+            case .failure:
+                break
+            }
+        }
+
+        let totalDuration = formatTimestamp(duration)
+
+        return VideoFrameResult(
+            frames: ciImages,
+            timestamps: timestamps,
+            totalDuration: totalDuration
+        )
+    }
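For intuition about the sampling math: with targetFPS = 1, a 2 s clip gets adjustedFPS = (10 - 1.8) * 1 = 8.2 (about 16 frames), a 5 s clip gets 5.5 (about 28 frames), and anything of 10 s or more bottoms out at 1 fps, always capped by maxFrames. A minimal usage sketch (same-module access assumed, since VideoFrameResult's fields are not public; `videoURL` is a stand-in):

// Sample up to 20 frames from a video and print their timestamps.
func sampleFrames(from videoURL: URL) async throws {
    let asset = AVURLAsset(url: videoURL)
    let result = try await MediaProcessing.asCIImageSequence(
        asset, maxFrames: 20, targetFPS: 1.0)
    print("\(result.frames.count) frames over \(result.totalDuration)")
    for (frame, stamp) in zip(result.frames, result.timestamps) {
        print("\(stamp): \(Int(frame.extent.width))x\(Int(frame.extent.height))")
    }
}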
+
+    private static func formatTimestamp(_ time: CMTime) -> String {
+        let totalSeconds = Int(ceil(time.seconds))
+        let hours = totalSeconds / 3600
+        let minutes = (totalSeconds % 3600) / 60
+        let seconds = totalSeconds % 60
+
+        return String(format: "%d:%02d:%02d", hours, minutes, seconds)
+    }
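Note the ceil: an actual frame time of 3724.2 s rounds up to 3725 s and formats as "1:02:05".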
 }
+
+// MARK: - Convenience
+
+extension CIImage {
+    public enum ResamplingMethod {
+        case bicubic
+        case lanczos
+    }
+
+    public func resampled(to size: CGSize, method: ResamplingMethod = .bicubic) -> CIImage {
+        switch method {
+        case .bicubic:
+            return MediaProcessing.resampleBicubic(self, to: size)
+        case .lanczos:
+            return MediaProcessing.resampleLanczos(self, to: size)
+        }
+    }
+
+    public func toSRGB() -> CIImage {
+        return MediaProcessing.inSRGBToneCurveSpace(self)
+    }
+
+    public func toLinear() -> CIImage {
+        return MediaProcessing.inLinearToneCurveSpace(self)
+    }
+
+    public func normalized(mean: (CGFloat, CGFloat, CGFloat), std: (CGFloat, CGFloat, CGFloat))
+        -> CIImage
+    {
+        return MediaProcessing.normalize(self, mean: mean, std: std)
+    }
+
+    public func asMLXArray(colorSpace: CGColorSpace? = nil) -> MLXArray {
+        return MediaProcessing.asMLXArray(self, colorSpace: colorSpace)
+    }
+}
Review comment (on the CIImage conveniences): I think these are fine for now, but this plus the message formatting needs to be figured out (later :-) )

Reply: Agreed!
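As a closing illustration, the new conveniences compose into a compact preprocessing pipeline (a sketch; the mean/std constants are placeholders, not any particular model's values):

// Hypothetical preprocessing pipeline built from the new CIImage conveniences.
// Real mean/std values would come from the model's processor configuration.
func preprocess(_ image: CIImage, to size: CGSize) -> MLXArray {
    image
        .toSRGB()
        .resampled(to: size, method: .lanczos)
        .normalized(mean: (0.5, 0.5, 0.5), std: (0.5, 0.5, 0.5))
        .asMLXArray()
}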