
Commit

Merge branch 'main' of https://github.com/jafioti/luminal
jafioti committed Apr 22, 2024
2 parents 5c99c9d + 3184d27 commit 8fc4385
Showing 10 changed files with 129 additions and 188 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -28,9 +28,9 @@ println!("Result: {:?}", c);
```

## Getting Started
-**Mistral 7B**
+**Llama 3 8B**
```bash
-cd ./examples/mistral
+cd ./examples/llama
# Download the model
bash ./setup/setup.sh
# Run the model
@@ -41,7 +41,7 @@ cargo run --release # CPU

## Features
### Speed
-Luminal can run Q8 Mistral 7B on M-series Macbooks at 15-25 tokens per second. The goal is to become the fastest ML framework for any model on any device.
+Luminal can run Q8 Llama 3 8B on M-series Macbooks at 15-25 tokens per second. The goal is to become the fastest ML framework for any model on any device.

### Simplicity
The core of luminal is and always will be minimal. It should be possible to understand the entire core library in an afternoon.
@@ -88,7 +88,7 @@ Once you've written all your computation code, run `cx.display()` to see the entire computation graph.
## Where are we?
- Metal and Cuda are supported for running models on Macs and Nvidia GPUs respectively, in both full and half precision.
- Performance on M-series macs with LLMs is within 20% of llama.cpp (a *heavily* optimized library)
-- Mistral 7B and Llama 7B are implemented in `examples/`. See instructions above for running.
+- Mistral 7B and Llama 8B are implemented in `examples/`. See instructions above for running.
- We have a small library of NN modules in `nn`, including transformers.
- A significant amount of high-level ops are implemented in `hl_ops`. We are aiming to match the most used ~80% of the pytorch api.
- The aim for 0.3 is to achieve SOTA performance on an M1 pro (50 tok/s), and near SOTA on single nvidia gpus (>100 tok/s), as well as support many mainstream models (Whisper, Stable Diffusion, Yolo v9, etc.) See the tracking issue [here](https://github.com/jafioti/luminal/issues/29)
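For context on the `cx.display()` call referenced in the last hunk above, here is a minimal sketch of luminal's graph-building flow, patterned on the README's own add example (the exact tensor/shape API — `R1<3>`, `set`, `retrieve` — is an assumption and may differ between versions):

```rust
use luminal::prelude::*;

fn main() {
    let mut cx = Graph::new();
    // Building ops only records them on the graph; nothing runs yet.
    let a = cx.tensor::<R1<3>>().set([1.0, 2.0, 3.0]);
    let b = cx.tensor::<R1<3>>().set([1.0, 2.0, 3.0]);
    let mut c = (a + b).retrieve();

    cx.display(); // inspect the whole computation graph before running it

    cx.execute();
    println!("Result: {:?}", c);
}
```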
4 changes: 3 additions & 1 deletion crates/luminal_cuda/Cargo.toml
@@ -16,8 +16,10 @@ luminal_cudarc = { version="0.10.0", features = [
itertools = "0.12.1"
rustc-hash = "1.1.0"
num-traits = "0.2.18"
+fmt-derive = "0.1.1"

[dev-dependencies]
dfdx = { version = "0.13", features = ["f16"] }
rand = "0.8.5"
-paste = "1.0.14"
\ No newline at end of file
+paste = "1.0.14"
+luminal_nn = {path="../../crates/luminal_nn"}
25 changes: 9 additions & 16 deletions crates/luminal_cuda/src/binary.rs
@@ -1,5 +1,6 @@
use std::{marker::PhantomData, sync::Arc};

+use fmt_derive::Debug;
use luminal_cudarc::driver::{CudaDevice, CudaFunction, DeviceRepr, LaunchAsync, LaunchConfig};

use luminal::{
@@ -15,7 +16,7 @@ use crate::{
render_dyn_dim_inputs, CudaData, CudaFloat,
};

-#[derive(LuminalEqTrue, LuminalPrint, Clone)]
+#[derive(Clone, Debug)]
pub struct CudaSub<T> {
function: CudaFunction,
device: Arc<CudaDevice>,
@@ -81,7 +82,7 @@ impl<T: CudaFloat> Operator for CudaSub<T> {
}
}

-#[derive(LuminalPrint, Default)]
+#[derive(Debug, Default)]
pub struct SubtractionCompiler<T: CudaFloat>(PhantomData<T>);

impl<T: CudaFloat> Compiler for SubtractionCompiler<T> {
@@ -139,7 +140,7 @@ impl<T: CudaFloat> Compiler for SubtractionCompiler<T> {
}
}

-#[derive(LuminalEqTrue, LuminalPrint, Clone)]
+#[derive(Clone, Debug)]
pub struct CudaEqual<T> {
function: CudaFunction,
device: Arc<CudaDevice>,
@@ -205,7 +206,7 @@ impl<T: CudaFloat> Operator for CudaEqual<T> {
}
}

-#[derive(LuminalPrint, Default)]
+#[derive(Debug, Default)]
pub struct EqualCompiler<T: CudaFloat>(PhantomData<T>);

impl<T: CudaFloat> Compiler for EqualCompiler<T> {
Expand Down Expand Up @@ -262,7 +263,7 @@ impl<T: CudaFloat> Compiler for EqualCompiler<T> {
}
}

-#[derive(LuminalPrint, Clone, LuminalEqFalse)]
+#[derive(Clone, Debug)]
pub struct CudaGather<T> {
function: CudaFunction,
device: Arc<CudaDevice>,
@@ -294,13 +295,7 @@ extern \"C\" __global__ void kernel({type_name} *out, const {type_name} *weights
impl<T: CudaFloat> Operator for CudaGather<T> {
fn process(&mut self, inputs: Vec<(InputTensor, ShapeTracker)>) -> Vec<Tensor> {
// Inp 1 should be Vec<f32> and inp 2 should be a CudaSlice<T>
-        let indexes = inputs[0]
-            .0
-            .borrowed()
-            .data
-            .as_any()
-            .downcast_ref::<Vec<f32>>()
-            .unwrap();
+        let indexes = inputs[0].0.borrowed().downcast_ref::<Vec<f32>>().unwrap();
let weights = get_buffer_from_tensor::<T>(&inputs[1].0);

let mut indexes_buffer = unsafe { self.device.alloc::<f32>(indexes.len()).unwrap() };
@@ -335,13 +330,11 @@ impl<T: CudaFloat> Operator for CudaGather<T> {
.unwrap();
}

-        vec![Tensor {
-            data: Box::new(CudaData(out)),
-        }]
+        vec![Tensor::new(CudaData(out))]
}
}

-#[derive(LuminalPrint, Default)]
+#[derive(Debug, Default)]
pub struct GatherCompiler<T: CudaFloat>(PhantomData<T>);

impl<T: CudaFloat> Compiler for GatherCompiler<T> {
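The recurring change in this file swaps luminal's old custom derives (`LuminalPrint`, `LuminalEqTrue`, `LuminalEqFalse`) for `fmt_derive::Debug`, imported at the top of the file. A sketch of the motivation, assuming fmt-derive's advertised behavior of deriving `Debug` even when fields or type parameters lack a `Debug` impl (which std's derive requires):

```rust
use fmt_derive::Debug; // shadows the built-in derive, as in the diff above

// Stand-in for a field type with no Debug impl (e.g. a raw GPU handle).
struct OpaqueHandle;

// std's #[derive(Debug)] would demand OpaqueHandle: Debug and add a
// T: Debug bound; fmt_derive emits a best-effort impl without either.
#[derive(Debug)]
struct CudaOpLike<T> {
    handle: OpaqueHandle,
    marker: std::marker::PhantomData<T>,
}

fn main() {
    let op = CudaOpLike::<f32> {
        handle: OpaqueHandle,
        marker: std::marker::PhantomData,
    };
    println!("{:?}", op);
}
```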
37 changes: 17 additions & 20 deletions crates/luminal_cuda/src/lib.rs
@@ -20,8 +20,6 @@ use std::{collections::hash_map::DefaultHasher, ffi::c_void, fmt::Write, hash::H

use luminal::{op::InputTensor, prelude::*};

-use self::symbolic::{BigExpression, Term};

pub type CudaCompiler<T> = (
prim::PrimitiveCompiler<T>,
binary::SubtractionCompiler<T>,
@@ -80,16 +78,6 @@ impl<T: CudaFloat> Data for CudaData<T> {
}
}

-impl Data for CudaData<u8> {
-    fn as_any(&self) -> &dyn std::any::Any {
-        self
-    }
-
-    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
-        self
-    }
-}

impl CudaFloat for f16 {
fn from_f32(a: f32) -> Self {
f16::from_f32(a)
@@ -105,6 +93,21 @@ impl CudaFloat for f16 {
}
}

+impl CudaFloat for u8 {
+    fn from_f32(a: f32) -> Self {
+        a as u8
+    }
+    fn to_f32(self) -> f32 {
+        self as f32
+    }
+    fn is_f32() -> bool {
+        false
+    }
+    fn type_name() -> &'static str {
+        "uint8_t"
+    }
+}

fn expr_to_cuda_string(expr: BigExpression) -> String {
let mut symbols = vec![];
for term in expr.terms {
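The new `u8` impl added in the hunk above mirrors the existing `f16` one, extending the backend toward quantized (Q8) weights. Reconstructed purely from the two impls visible in this diff, the `CudaFloat` trait presumably looks roughly like the following — the real trait in luminal_cuda likely carries additional bounds and methods:

```rust
// Illustrative reconstruction, inferred from the f16 and u8 impls above.
pub trait CudaFloat: Copy + 'static {
    fn from_f32(a: f32) -> Self;    // host f32 -> device element type
    fn to_f32(self) -> f32;         // device element type -> host f32
    fn is_f32() -> bool;            // true only for f32 itself
    fn type_name() -> &'static str; // CUDA-side type name, e.g. "uint8_t"
}
```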
@@ -195,14 +198,8 @@ fn hash<T: std::hash::Hash>(obj: T) -> u64 {
hasher.finish()
}

-fn get_buffer_from_tensor<'a, T: 'static>(tensor: &'a InputTensor) -> &'a CudaSlice<T> {
-    &tensor
-        .borrowed()
-        .data
-        .as_any()
-        .downcast_ref::<CudaData<T>>()
-        .unwrap()
-        .0
+fn get_buffer_from_tensor<'a, T: CudaFloat>(tensor: &'a InputTensor) -> &'a CudaSlice<T> {
+    &tensor.borrowed().downcast_ref::<CudaData<T>>().unwrap().0
}

fn input_dyn_dims(
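This hunk, together with the matching ones in binary.rs and other.rs, leans on two luminal-core conveniences: `Tensor::new(...)` replacing the struct literal, and `downcast_ref` called directly on the borrowed tensor replacing the long `.data.as_any().downcast_ref::<T>()` chain. A hypothetical sketch of what those helpers likely wrap (simplified stand-in types, not the verbatim upstream code):

```rust
use std::any::Any;

// Simplified stand-ins for luminal's actual Data/Tensor types.
pub trait Data: Any {
    fn as_any(&self) -> &dyn Any;
}

pub struct Tensor {
    data: Box<dyn Data>,
}

impl Tensor {
    // Replaces `Tensor { data: Box::new(...) }` at call sites.
    pub fn new<T: Data>(data: T) -> Self {
        Tensor { data: Box::new(data) }
    }

    // Wraps the old `.data.as_any().downcast_ref::<T>()` chain.
    pub fn downcast_ref<T: Any>(&self) -> Option<&T> {
        self.data.as_any().downcast_ref::<T>()
    }
}
```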
3 changes: 2 additions & 1 deletion crates/luminal_cuda/src/matmul.rs
@@ -1,5 +1,6 @@
use std::{marker::PhantomData, sync::Arc};

+use fmt_derive::Debug;
use luminal_cudarc::{
cublas::{sys::cublasOperation_t::*, CudaBlas},
driver::{CudaDevice, DevicePtr, DevicePtrMut},
@@ -15,7 +16,7 @@ use luminal::{
prelude::*,
};

-#[derive(LuminalPrint, LuminalEqFalse, Clone)]
+#[derive(Clone, Debug)]
pub struct Matmul<T>(Arc<CudaBlas>, Arc<CudaDevice>, PhantomData<T>);

impl<T: CudaFloat> Operator for Matmul<T> {
11 changes: 5 additions & 6 deletions crates/luminal_cuda/src/other.rs
@@ -2,7 +2,8 @@ use std::{marker::PhantomData, sync::Arc};

use luminal_cudarc::driver::{CudaDevice, CudaFunction, LaunchAsync, LaunchConfig};

-use luminal::{op::*, prelude::*, shape::symbolic::BigExpression};
+use fmt_derive::Debug;
+use luminal::prelude::*;
use rustc_hash::FxHashMap;

use crate::{
@@ -12,7 +13,7 @@ use crate::{
CudaData, CudaFloat,
};

-#[derive(LuminalPrint, Clone, LuminalEqFalse)]
+#[derive(Clone, Debug)]
pub struct CudaARange<T> {
function: CudaFunction,
device: Arc<CudaDevice>,
@@ -65,13 +66,11 @@ impl<T: CudaFloat> Operator for CudaARange<T> {
.unwrap();
}

-        vec![Tensor {
-            data: Box::new(CudaData(out)),
-        }]
+        vec![Tensor::new(CudaData(out))]
}
}

-#[derive(LuminalPrint, Default)]
+#[derive(Debug, Default)]
pub struct ARangeCompiler<T: CudaFloat>(PhantomData<T>);

impl<T: CudaFloat> Compiler for ARangeCompiler<T> {
[Diffs for the remaining changed files were not loaded.]
