From 1e3d725ecd6f06f873e1784d36e5b58654975d14 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 11:46:57 +0000
Subject: [PATCH 01/37] chore: first pass pytorch integration

---
 .gitignore                             |   1 +
 crates/ratchet-core/Cargo.toml         |  12 ++-
 crates/ratchet-core/src/storage/mod.rs |  10 +++
 crates/ratchet-core/src/tensor.rs      | 113 ++++++++++++++++++++++++-
 justfile                               |   6 ++
 5 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6985cf1b..ce0eaee0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ Cargo.lock

 # MSVC Windows builds of rustc generate these, which store debugging information
 *.pdb
+.python-version

diff --git a/crates/ratchet-core/Cargo.toml b/crates/ratchet-core/Cargo.toml
index 97b1619b..f2099d76 100644
--- a/crates/ratchet-core/Cargo.toml
+++ b/crates/ratchet-core/Cargo.toml
@@ -4,7 +4,8 @@ version = "0.1.0"
 edition = "2021"

 [features]
-default = ["rand"]
+default = ["rand", "pyo3"]
+pyo3 = ["dep:pyo3", "dep:numpy", "dep:ndarray"]
 gpu_profiling = []
 rand = ["dep:rand", "dep:rand_distr"]

@@ -36,5 +37,14 @@ rand_distr = { version = "0.4.3", optional = true }
 rand = { version = "0.8.4", optional = true }
 lazy_static = "1.4.0"

+# Python bindings
+pyo3 = { version = "0.20.2", features=["auto-initialize"], optional = true }
+numpy = { version = "0.20.0", optional = true }
+ndarray = { version = "0.15.6", optional = true }
+
 [dev-dependencies]
 rand = "0.8.4"
+pyo3 = { version = "0.20.2", features=["auto-initialize"] }
+numpy = { version = "0.20.0" }
+ndarray = { version = "0.15.6" }
+

diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs
index 0620beff..f998dc16 100644
--- a/crates/ratchet-core/src/storage/mod.rs
+++ b/crates/ratchet-core/src/storage/mod.rs
@@ -52,6 +52,16 @@ impl Storage {
         }
     }

+    pub fn try_cpu(&self) -> Result<&RawCPUBuffer, DeviceError> {
+        match self.raw.as_ref() {
+            Some(RawStorage::CPU(raw)) => Ok(raw),
+            _ => Err(DeviceError::DeviceMismatch(
+                "CPU".to_string(),
+                "GPU".to_string(),
+            )),
+        }
+    }
+
     pub fn dump(&self, dtype: DType, full: bool) -> String {
         self.raw
             .as_ref()

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index ef72021d..da2e77ea 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -1,7 +1,7 @@
 use crate::gpu::{CpuUniform, WgpuDevice};
 use crate::{
     ops::*, CompiledOp, DType, Device, DeviceStorage, Executable, Operation, OperationError,
-    RawStorage, Shape, Storage, Strides, TensorDType, TensorId,
+    RawCPUBuffer, RawStorage, Shape, Storage, Strides, TensorDType, TensorId,
 };
 use crate::{BinaryOp, LazyOp};

@@ -12,6 +12,12 @@ use std::sync::Arc;
 #[cfg(feature = "rand")]
 use {rand::prelude::*, rand_distr::StandardNormal};

+#[cfg(feature = "pyo3")]
+use {
+    ndarray::{ArrayD, ArrayViewD},
+    numpy::PyArrayDyn,
+};
+
 // thiserror error for Tensor
 #[derive(thiserror::Error, Debug)]
 pub enum TensorError {
@@ -295,16 +301,87 @@ impl Tensor {
             _ => Ok(self.clone()),
         }
     }
+
+    #[cfg(feature = "pyo3")]
+    pub fn into_ndarray<T: TensorDType>(&self) -> ArrayD<T> {
+        assert!(self.device().is_cpu());
+        let storage = self.storage().try_read().unwrap();
+        let raw_cpu = storage.try_cpu().unwrap();
+        let shape = self.shape().to_vec();
+        if self.num_bytes() != 0 {
+            let ptr = raw_cpu.inner().0 as *const T;
+            unsafe { ArrayViewD::from_shape_ptr(shape, ptr).to_owned() }
+        } else {
+            ArrayViewD::from_shape(shape, &[]).unwrap().to_owned()
+        }
+    }
+
+    #[cfg(feature = "pyo3")]
+    pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>(
+        &'s self,
+        py: &'p pyo3::Python<'p>,
+    ) -> &PyArrayDyn<T> {
+        use numpy::PyArray;
+        PyArray::from_owned_array(*py, self.clone().into_ndarray::<T>())
+    }
 }
+
+#[cfg(feature = "pyo3")]
+impl<T: TensorDType> From<ArrayD<T>> for Tensor {
+    fn from(it: ArrayD<T>) -> Self {
+        if it.as_slice().is_some() {
+            let layout = std::alloc::Layout::from_size_align(
+                it.len() * std::mem::size_of::<T>(),
+                std::mem::align_of::<T>(),
+            )
+            .unwrap();
+            let shape = it.shape().to_vec().into();
+            let strides = Strides::from(&shape);
+            let vec = it.into_raw_vec().into_boxed_slice();
+            let ptr = Box::into_raw(vec) as *mut u8;
+
+            let raw_buf = RawCPUBuffer::new(ptr, layout);
+            let storage = Storage::from(RawStorage::CPU(raw_buf));
+            let meta = StorageView::new(shape, T::dt(), strides);
+            Tensor::new(LazyOp::Const, meta, storage, Device::CPU)
+        } else {
+            panic!("Cannot convert numpy array with non-contiguous memory layout to tensor");
+        }
+    }
+}
+
+#[cfg(feature = "pyo3")]
+impl<T: TensorDType + numpy::Element> From<&PyArrayDyn<T>> for Tensor {
+    fn from(array: &PyArrayDyn<T>) -> Self {
+        Self::from(array.to_owned_array())
+    }
+}

 #[cfg(test)]
 mod tests {
+    use pyo3::{types::PyModule, Python};
+
     use crate::{shape, DeviceRequest};

     use super::*;

     #[test]
-    fn test_cfg() -> anyhow::Result<()> {
+    fn test_matmul() -> anyhow::Result<()> {
+        let device = Device::request_device(DeviceRequest::GPU)?;
+        let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
+        let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
+        let c = a.matmul(&b)?;
+        c.resolve()?;
+        println!("\nA: {:#?}", a);
+        println!("\nB: {:#?}", b);
+        println!("\nC: {:#?}", c);
+        let d = c.to(Device::CPU)?;
+        println!("\nD: {:#?}", d);
+        Ok(())
+    }
+
+    #[test]
+    fn test_pyo3() -> anyhow::Result<()> {
         let device = Device::request_device(DeviceRequest::GPU)?;
         let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
         let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
@@ -315,6 +392,38 @@ mod tests {
         println!("\nC: {:#?}", c);
         let d = c.to(Device::CPU)?;
         println!("\nD: {:#?}", d);
+
+        let a = a.to(Device::CPU)?;
+        let b = b.to(Device::CPU)?;
+        let c = Python::with_gil(|py| {
+            let npy_a = a.to_py::<f32>(&py);
+            let npy_b = b.to_py::<f32>(&py);
+
+            let activators = PyModule::from_code(
+                py,
+                r#"
+import numpy as np
+import torch
+
+def matmul(a, b):
+    return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+"#,
+                "x.py",
+                "x",
+            )
+            .unwrap();
+
+            let result = activators
+                .getattr("matmul")
+                .unwrap()
+                .call1((npy_a, npy_b))
+                .unwrap()
+                .extract::<&PyArrayDyn<f32>>()
+                .unwrap();
+            Tensor::from(result)
+        });
+        println!("\nC: {:#?}", c);
+
         Ok(())
     }
 }

diff --git a/justfile b/justfile
index 99357e51..107577c3 100644
--- a/justfile
+++ b/justfile
@@ -1,2 +1,8 @@
 line-count:
     cd ./crates/ratchet-core && scc -irs --exclude-file kernels
+install-pyo3:
+    env PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install --verbose 3.10.6
+    echo "Please add PYO3_PYTHON to your .bashrc or .zshrc"
+wasm CRATE:
+    RUSTFLAGS=--cfg=web_sys_unstable_apis wasm-pack build --target web -d `pwd`/target/pkg/{{CRATE}} --out-name {{CRATE}} ./crates/{{CRATE}} --release
+
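The `From<ArrayD<T>>` conversion in this patch is the crux of the series: it steals ndarray's heap allocation (`into_raw_vec().into_boxed_slice()` then `Box::into_raw`), and `RawCPUBuffer` later frees it with `std::alloc::dealloc`. That is only sound if the `Layout` handed to `dealloc` matches the original allocation, which `into_boxed_slice` guarantees by shrinking capacity to length. A standalone sketch of the invariant (illustrative, not part of the patch):

    fn steal_and_free(v: Vec<f32>) {
        if v.is_empty() {
            return; // an empty Vec may never have allocated
        }
        let len = v.len();
        let boxed = v.into_boxed_slice(); // shrinks capacity to exactly len
        let ptr = Box::into_raw(boxed) as *mut u8;
        // This Layout matches the Vec's allocation only because capacity == len.
        let layout = std::alloc::Layout::array::<f32>(len).unwrap();
        // Exactly one owner may run this; a second free of `ptr` is the
        // double free the later commits in this series hunt down.
        unsafe { std::alloc::dealloc(ptr, layout) };
    }
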
From 0afb27d46ffe2ddb2521ea8c7fc69a5f1dada6b2 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 11:51:15 +0000
Subject: [PATCH 02/37] chore: cleaning

---
 crates/ratchet-core/src/tensor.rs | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index da2e77ea..97f57639 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -395,14 +395,10 @@ mod tests {

         let a = a.to(Device::CPU)?;
         let b = b.to(Device::CPU)?;
-        let c = Python::with_gil(|py| {
-            let npy_a = a.to_py::<f32>(&py);
-            let npy_b = b.to_py::<f32>(&py);
-
-            let activators = PyModule::from_code(
+        let c: anyhow::Result<Tensor> = Python::with_gil(|py| {
+            let prg = PyModule::from_code(
                 py,
                 r#"
-import numpy as np
 import torch

 def matmul(a, b):
@@ -410,19 +406,15 @@ def matmul(a, b):
 "#,
                 "x.py",
                 "x",
-            )
-            .unwrap();
+            )?;

-            let result = activators
-                .getattr("matmul")
-                .unwrap()
-                .call1((npy_a, npy_b))
-                .unwrap()
-                .extract::<&PyArrayDyn<f32>>()
-                .unwrap();
-            Tensor::from(result)
+            let result = prg
+                .getattr("matmul")?
+                .call1((a.to_py::<f32>(&py), b.to_py::<f32>(&py)))?
+                .extract::<&PyArrayDyn<f32>>()?;
+            Ok(Tensor::from(result))
         });
-        println!("\nC: {:#?}", c);
+        println!("\nTORCH: {:#?}", c);

         Ok(())
     }
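This cleanup leans on pyo3's error conversion: `PyErr` implements `std::error::Error`, so inside a closure returning `anyhow::Result<Tensor>` every `unwrap()` can become `?`. The pattern in miniature (a sketch, not from the patch):

    fn eval_x(py: pyo3::Python<'_>) -> anyhow::Result<i32> {
        // PyErr converts into anyhow::Error automatically, so `?` works here.
        let m = pyo3::types::PyModule::from_code(py, "x = 1 + 1", "x.py", "x")?;
        Ok(m.getattr("x")?.extract::<i32>()?)
    }
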
From 8c2b54d6ab89f92eec495d79f994427722a37a32 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 11:54:14 +0000
Subject: [PATCH 03/37] chore: add python to CI for tests

---
 .github/workflows/rust.yml | 6 ++++++
 requirements.txt           | 3 +++
 2 files changed, 9 insertions(+)
 create mode 100644 requirements.txt

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index ab38dbfd..cdaea5e8 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -33,6 +33,12 @@ jobs:
           sudo apt-get update
           sudo apt install -y libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev vulkan-sdk mesa-vulkan-drivers pkg-config libasound2-dev

+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10.6'
+          cache: 'pip'
+      - run: pip install -r requirements.txt
       - name: Setup
         run: |
           cargo install wasm-pack

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..cbd0e90e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+numpy==1.24.3
+torch==2.0.1

From 78e3289f61c2c29a9b7fd1538fe7b8eb0f6e67b9 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 12:18:15 +0000
Subject: [PATCH 04/37] chore: sigsegv

---
 .github/workflows/rust.yml       | 1 +
 Cargo.toml                       | 3 ---
 crates/ratchet-loader/Cargo.toml | 4 ++--
 crates/ratchet-models/Cargo.toml | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index cdaea5e8..7c62f447 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -10,6 +10,7 @@ env:
   CARGO_TERM_COLOR: always
   WGPU_DX12_COMPILER: dxc
   RUSTFLAGS: --cfg=web_sys_unstable_apis
+  RUST_BACKTRACE: 1

 jobs:
   build:

diff --git a/Cargo.toml b/Cargo.toml
index 8d0becb9..20543cea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,6 +28,3 @@ derive-new = "0.6.0"
 log = "0.4.20"
 thiserror = "1.0.56"
 byteorder = "1.5.0"
-
-[workspace.dev-dependencies]
-hf-hub = "0.3.0"

diff --git a/crates/ratchet-loader/Cargo.toml b/crates/ratchet-loader/Cargo.toml
index 3df25b7c..48be9dee 100644
--- a/crates/ratchet-loader/Cargo.toml
+++ b/crates/ratchet-loader/Cargo.toml
@@ -6,8 +6,8 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-half.workspace = true
 ratchet = { path = "../ratchet-core" }
+half.workspace = true
 byteorder.workspace = true
 anyhow.workspace = true
 bytemuck.workspace = true
@@ -16,4 +16,4 @@ derive-new.workspace = true
 log.workspace = true

 [dev-dependencies]
-hf-hub = "0.3.2"
+hf-hub = "0.3.2"

diff --git a/crates/ratchet-models/Cargo.toml b/crates/ratchet-models/Cargo.toml
index e88ae67d..c1e293d8 100644
---
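The crash the next few commits chase is a SIGSEGV that turns out to be a double free: two owners of one CPU allocation. A deliberately broken, hypothetical reproduction of that failure mode (not from the patch); running it under `cargo +nightly miri test` pinpoints the bug without involving the GPU path at all:

    #[test]
    fn double_free_repro() {
        // DELIBERATELY broken: two Boxes are reconstructed from the same raw
        // pointer, so the allocation is freed twice when both drop.
        let ptr = Box::into_raw(vec![1u8, 2, 3].into_boxed_slice());
        let first = unsafe { Box::from_raw(ptr) };
        let second = unsafe { Box::from_raw(ptr) };
        drop(first);
        drop(second); // Miri reports the double free here
    }
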
a/crates/ratchet-models/Cargo.toml
+++ b/crates/ratchet-models/Cargo.toml
@@ -15,5 +15,5 @@ derive-new.workspace = true
 log.workspace = true

 [dev-dependencies]
-hf-hub = { version = "0.3.0" }
+hf-hub = "0.3.2"


From 77d7d390a051b29a39c4f61d88270429d8636319 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 12:29:58 +0000
Subject: [PATCH 05/37] chore: sigsegv

---
 crates/ratchet-core/src/tensor.rs | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 97f57639..89bbbbd5 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -382,19 +382,9 @@ mod tests {

     #[test]
     fn test_pyo3() -> anyhow::Result<()> {
-        let device = Device::request_device(DeviceRequest::GPU)?;
+        let device = Device::request_device(DeviceRequest::CPU)?;
         let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
         let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
-        let c = a.matmul(&b)?;
-        c.resolve()?;
-        println!("\nA: {:#?}", a);
-        println!("\nB: {:#?}", b);
-        println!("\nC: {:#?}", c);
-        let d = c.to(Device::CPU)?;
-        println!("\nD: {:#?}", d);
-
-        let a = a.to(Device::CPU)?;
-        let b = b.to(Device::CPU)?;
         let c: anyhow::Result<Tensor> = Python::with_gil(|py| {
             let prg = PyModule::from_code(
                 py,

From ca8b76a7de613e7a13c0c072a8b8ed6ff88ee33f Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 12:45:25 +0000
Subject: [PATCH 06/37] chore: double free

---
 crates/ratchet-core/src/device.rs             | 2 ++
 crates/ratchet-core/src/storage/gpu_buffer.rs | 9 ++-------
 crates/ratchet-core/src/tensor.rs             | 5 +++--
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/crates/ratchet-core/src/device.rs b/crates/ratchet-core/src/device.rs
index 368db437..d30e8da1 100644
--- a/crates/ratchet-core/src/device.rs
+++ b/crates/ratchet-core/src/device.rs
@@ -14,6 +14,8 @@ pub enum DeviceError {
     BufferAllocationFailed(#[from] AllocatorError),
     #[error("Invalid GPU Buffer Usage, current: {0:?}, required: {1:?}")]
     InvalidBufferUsage(wgpu::BufferUsages, wgpu::BufferUsages),
+    #[error("Failed to transfer buffer with error: {0:?}")]
+    BufferTransferFailed(#[from] wgpu::BufferAsyncError),
 }

 pub enum DeviceRequest {

diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs
index e7aa856a..a5baedc7 100644
--- a/crates/ratchet-core/src/storage/gpu_buffer.rs
+++ b/crates/ratchet-core/src/storage/gpu_buffer.rs
@@ -91,7 +91,7 @@ impl DeviceStorage for RawGPUBuffer {
         self.validate_usages(BufferUsages::COPY_SRC)?;
         let device = device.try_gpu()?;
         let buffer_slice = self.inner.slice(..);
-        let (tx, rx) = futures_intrusive::channel::shared::oneshot_channel();
+        let (tx, rx) = std::sync::mpsc::channel();

         let alignment = self.alignment;
         wgpu::util::DownloadBuffer::read_buffer(
@@ -107,12 +107,7 @@
             },
         );
         device.poll(wgpu::Maintain::Wait);
-        //TODO: fix unwrap
-        let storage = pollster::block_on(async { rx.receive().await })
-            .ok_or(TensorError::TransferError)
-            .unwrap()
-            .map_err(|_| TensorError::TransferError)
-            .unwrap();
+        let storage = rx.recv().unwrap()?;
         Ok(storage)
     }

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 89bbbbd5..683e5dce 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -246,8 +246,8 @@ impl Tensor {

     pub fn compile(&self, uniform: &mut CpuUniform, device: &WgpuDevice) -> Option<CompiledOp> {
         match self.op() {
-            LazyOp::Binary(b) => Some(b.compile(self, uniform, device).unwrap()),
-            LazyOp::Matmul(m) => Some(m.compile(self, uniform, device).unwrap()),
+            LazyOp::Binary(b) => b.compile(self, uniform, device).ok(),
+            LazyOp::Matmul(m) => m.compile(self, uniform, device).ok(),
             LazyOp::Const => None,
             _ => unimplemented!(),
         }
@@ -338,6 +338,7 @@ impl<T: TensorDType> From<ArrayD<T>> for Tensor {
         let shape = it.shape().to_vec().into();
         let strides = Strides::from(&shape);
         let vec = it.into_raw_vec().into_boxed_slice();
+        //This is causing a double free
        let ptr = Box::into_raw(vec) as *mut u8;

         let raw_buf = RawCPUBuffer::new(ptr, layout);
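Patch 06 also swaps the `futures_intrusive` oneshot plus `pollster` readback for a plain `std::sync::mpsc` channel. Blocking on the channel is safe because `device.poll(wgpu::Maintain::Wait)` does not return until the map callback has fired, so the send has already happened by the time `recv()` runs. The core of the pattern (a sketch using the names from the hunk above, with `device`, `buffer_slice` and `alignment` assumed in scope):

    let (tx, rx) = std::sync::mpsc::channel();
    wgpu::util::DownloadBuffer::read_buffer(device, device.queue(), &buffer_slice, move |result| {
        // Fires on this thread, inside the poll() below.
        tx.send(result.map(|db| RawCPUBuffer::from_bytes(&db, alignment)))
            .expect("receiver alive");
    });
    device.poll(wgpu::Maintain::Wait); // blocks until mapping callbacks have run
    let storage = rx.recv().unwrap()?; // cannot hang: the send already happened
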
From 6a98d051f5284b2d2de6a4ebfaa80401070676e1 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 16:38:58 +0000
Subject: [PATCH 07/37] chore: double free

---
 ARCHITECTURE.md                                 |  13 +-
 crates/ratchet-core/src/compiled_op.rs          |   5 +-
 .../ratchet-core/src/gpu/buffer_allocator.rs    |  25 ++-
 crates/ratchet-core/src/gpu/device.rs           |  12 +-
 .../src/gpu/pools/bind_group_pool.rs            |   4 +-
 .../ratchet-core/src/gpu/pools/buffer_pool.rs   |  12 +-
 crates/ratchet-core/src/gpu/uniform.rs          |   4 +-
 crates/ratchet-core/src/lib.rs                  |   1 +
 crates/ratchet-core/src/storage/cpu_buffer.rs   | 116 ++++++++-----
 crates/ratchet-core/src/storage/gpu_buffer.rs   |  47 ++----
 crates/ratchet-core/src/storage/mod.rs          | 106 +++---------
 crates/ratchet-core/src/tensor.rs               | 152 ++++++++++++------
 12 files changed, 257 insertions(+), 240 deletions(-)

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 8196351c..99b36fab 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -10,17 +10,16 @@ Ratchet is designed for 1 thing only: **Inference on WebGPU**.
 This leads us to a few design decisions:
 1. Ratchet is **lazy**, no computation is done until the entire computation graph is built and executed. This aligns closely with CUDAGraphs & Command buffers.
-2. Ratchet supports **BOTH** static & dynamic graphs, this is key.
-    - The graph is implicitly defined through tensor operations. If any of the tensors are defined with a *symbolic dimension* (i.e a dimension not known until runtime, e.g sequence_len), the graph is dynamic. When the graph is dynamic, the graph is recompiled on inference pass (because runtime information is required).
-    - If no tensors contain a symbolic dimension, the graph is static. This means the graph is compiled into a single command buffer, and is repeatedly called with different input data (brrr).
-    - By exposing symbolic dimensions to the user, they can code their models with the CG in mind.
+2. Ratchet supports **BOTH** static & dynamic graphs, see [Unified Graph Execution by Jittor](http://scis.scichina.com/en/2020/222103.pdf) for more details.
 3. Memory planning is crucial. Creation and first bind of a buffer is *expensive* in WebGPU. Therefore, Ratchet uses a greedy algorithm to pool buffers for intermediate results of the CFG.

-Why do this?
-
 Take for example Whisper from OpenAI. This is an encoder-decoder model, where the encoder is completely static (i.e everything is known at compile time), and the decoder is very dynamic (KV caching, seq_len increments every step). By allowing both paradigms, we can maximise performance.

+## Memory Management
+
+Ratchet's top-level `Tensor` is just an `Arc` around the `Inner`. Tensors should be cheaply cloneable.
+`Inner` contains a `Storage`; this is an enum over our 2 managed structures for CPU & GPU: `CpuStorage` & `GpuStorage`.
+`CpuStorage` is an `Arc<RawCPUBuffer>`, and `GpuStorage` is an `Arc<RawGPUBuffer>`.
+
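+In code, the layering looks roughly like this (a sketch; field names abridged from `tensor.rs` and `storage/mod.rs` in this patch):
+
+```rust
+struct Tensor {
+    inner: Arc<Inner>, // cloning a Tensor is a refcount bump
+}
+struct Inner {
+    // id, op, view, device, ...
+    storage: Arc<RwLock<Option<Storage>>>, // None until the graph is resolved
+}
+enum Storage {
+    CPU(CPUBuffer), // wraps an Arc around a (ptr, Layout) CPU allocation
+    GPU(GPUBuffer), // wraps a pooled wgpu::Buffer
+}
+```
+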
## Quantization diff --git a/crates/ratchet-core/src/compiled_op.rs b/crates/ratchet-core/src/compiled_op.rs index 32226d56..1fc2904c 100644 --- a/crates/ratchet-core/src/compiled_op.rs +++ b/crates/ratchet-core/src/compiled_op.rs @@ -29,8 +29,9 @@ impl CompiledOp { let mut bind_group_entries = drvec![]; for tensor in srcs.iter().chain(std::iter::once(&dst)) { - let buf = tensor.storage().try_read().unwrap(); - let gpu_buf = &buf.try_gpu().unwrap().inner; + let storage_guard = tensor.storage(); + let storage = storage_guard.as_ref().unwrap(); + let gpu_buf = &storage.try_gpu().unwrap().inner; bind_group_entries.push(BindGroupEntry { handle: gpu_buf.handle, offset: 0, diff --git a/crates/ratchet-core/src/gpu/buffer_allocator.rs b/crates/ratchet-core/src/gpu/buffer_allocator.rs index 3988bff7..e6017fb2 100644 --- a/crates/ratchet-core/src/gpu/buffer_allocator.rs +++ b/crates/ratchet-core/src/gpu/buffer_allocator.rs @@ -2,7 +2,7 @@ use rustc_hash::FxHashMap; use wgpu::BufferUsages; use crate::{ - gpu::{BufferDescriptor, BufferPool, GPUBuffer, GpuBufferHandle}, + gpu::{BufferDescriptor, BufferPool, GpuBufferHandle, PooledGPUBuffer}, DeviceError, Tensor, TensorId, }; use std::cell::{Ref, RefCell, RefMut}; @@ -31,7 +31,7 @@ impl BufferAllocator { self.pool.borrow_mut().begin_pass(pass_index); } - pub fn get(&self, handle: GpuBufferHandle) -> GPUBuffer { + pub fn get(&self, handle: GpuBufferHandle) -> PooledGPUBuffer { self.pool.borrow().get(handle).unwrap() } @@ -43,7 +43,7 @@ impl BufferAllocator { self.pool.borrow_mut() } - pub fn create_buffer(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> GPUBuffer { + pub fn create_buffer(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> PooledGPUBuffer { self.pool.borrow_mut().get_or_create(desc, device) } @@ -52,13 +52,13 @@ impl BufferAllocator { desc: &BufferDescriptor, contents: &[u8], device: &WgpuDevice, - ) -> GPUBuffer { + ) -> PooledGPUBuffer { let buf = self.pool.borrow_mut().get_or_create(desc, device); device.queue().write_buffer(&buf.inner, 0, contents); buf } - pub fn create_uniform_init(&self, uniform: CpuUniform, device: &WgpuDevice) -> GPUBuffer { + pub fn create_uniform_init(&self, uniform: CpuUniform, device: &WgpuDevice) -> PooledGPUBuffer { let mut uniform = uniform.into_inner(); uniform.resize( uniform.len() + UNIFORM_ALIGN - uniform.len() % UNIFORM_ALIGN, @@ -85,9 +85,9 @@ impl BufferAllocator { fn graph_allocate( &self, descriptor: BufferDescriptor, - free: &mut Vec, + free: &mut Vec, device: &WgpuDevice, - ) -> GPUBuffer { + ) -> PooledGPUBuffer { let required_size = descriptor.size as _; let mut closest_index = None; let mut closest_size_diff: Option = None; @@ -121,17 +121,16 @@ impl BufferAllocator { &self, execution_order: &[Tensor], device: &WgpuDevice, - ) -> Result, DeviceError> { + ) -> Result, DeviceError> { let mut free = Vec::new(); //TODO: switch to BTreeMap let mut assignments = FxHashMap::default(); for t in execution_order { if t.resolved() { - let storage_resource = t - .storage() - .try_read() - .ok_or(AllocatorError::BufferNotFound)?; - assignments.insert(t.id(), storage_resource.try_gpu()?.inner.clone()); + assignments.insert( + t.id(), + t.storage().as_ref().unwrap().try_gpu()?.inner.clone(), + ); continue; } diff --git a/crates/ratchet-core/src/gpu/device.rs b/crates/ratchet-core/src/gpu/device.rs index fefe081e..a5fa847c 100644 --- a/crates/ratchet-core/src/gpu/device.rs +++ b/crates/ratchet-core/src/gpu/device.rs @@ -5,7 +5,7 @@ use wgpu::{Adapter, DeviceType, Limits}; use crate::DeviceError; 
-use super::{BufferDescriptor, GPUBuffer, PoolError}; +use super::{BufferDescriptor, PoolError, PooledGPUBuffer}; pub const MAX_BUFFER_SIZE: u64 = (2 << 29) - 1; @@ -151,21 +151,21 @@ impl WgpuDevice { &self, desc: &BufferDescriptor, contents: &[u8], - ) -> Result { + ) -> Result { Ok(self .buffer_allocator .create_buffer_init(desc, contents, self)) } - pub fn create_uniform_init(&self, cpu_uniform: CpuUniform) -> GPUBuffer { + pub fn create_uniform_init(&self, cpu_uniform: CpuUniform) -> PooledGPUBuffer { self.buffer_allocator.create_uniform_init(cpu_uniform, self) } - pub fn allocate_buffer(&self, desc: &BufferDescriptor) -> Result { + pub fn allocate_buffer(&self, desc: &BufferDescriptor) -> Result { Ok(self.buffer_allocator.create_buffer(desc, self)) } - pub fn get_buffer(&self, handle: GpuBufferHandle) -> Result { + pub fn get_buffer(&self, handle: GpuBufferHandle) -> Result { Ok(self.buffer_allocator.get(handle)) } @@ -221,7 +221,7 @@ impl WgpuDevice { &self, execution_order: &[Tensor], device: &WgpuDevice, - ) -> Result, DeviceError> { + ) -> Result, DeviceError> { self.buffer_allocator.allocate_cfg(execution_order, device) } } diff --git a/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs b/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs index 94b1fa81..4b97e3e2 100644 --- a/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs +++ b/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs @@ -11,7 +11,7 @@ slotmap::new_key_type! { pub struct GpuBindGroupHandle; } #[derive(Clone)] pub struct GpuBindGroup { resource: Arc>, - _owned_buffers: RVec, + _owned_buffers: RVec, } impl std::fmt::Debug for GpuBindGroup { @@ -98,7 +98,7 @@ impl BindGroupPool { pub fn get_or_create(&self, desc: &BindGroupDescriptor, device: &WgpuDevice) -> GpuBindGroup { // Retrieve strong handles to buffers and textures. // This way, an owner of a bind group handle keeps buffers & textures alive!. - let owned_buffers: RVec = { + let owned_buffers: RVec = { desc.entries .iter() .map(|e| device.get_buffer(e.handle).unwrap()) diff --git a/crates/ratchet-core/src/gpu/pools/buffer_pool.rs b/crates/ratchet-core/src/gpu/pools/buffer_pool.rs index 4ce897d2..926df821 100644 --- a/crates/ratchet-core/src/gpu/pools/buffer_pool.rs +++ b/crates/ratchet-core/src/gpu/pools/buffer_pool.rs @@ -1,6 +1,6 @@ // Adapted from https://github.com/rerun-io/rerun MIT licensed use super::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc, PoolError}; -use crate::gpu::WgpuDevice; +use crate::{gpu::WgpuDevice, RawGPUBuffer}; #[derive(Clone, Hash, PartialEq, Eq, Debug, derive_new::new)] pub struct BufferDescriptor { @@ -19,8 +19,8 @@ slotmap::new_key_type! { pub struct GpuBufferHandle; } /// A reference-counter baked buffer. /// Once all instances are dropped, the buffer will be marked for reclamation in the following pass. 
-pub type GPUBuffer = - std::sync::Arc>; +pub type PooledGPUBuffer = + std::sync::Arc>; impl DynamicResourcesDesc for BufferDescriptor { fn resource_size_in_bytes(&self) -> u64 { @@ -37,7 +37,7 @@ impl DynamicResourcesDesc for BufferDescriptor { } pub struct BufferPool { - inner: DynamicResourcePool, + inner: DynamicResourcePool, } impl BufferPool { @@ -47,7 +47,7 @@ impl BufferPool { } } - pub fn get_or_create(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> GPUBuffer { + pub fn get_or_create(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> PooledGPUBuffer { self.inner.get_or_create(desc, |desc| { let (size, usage, mapped_at_creation) = desc.fields(); device.create_buffer(&wgpu::BufferDescriptor { @@ -64,7 +64,7 @@ impl BufferPool { } /// Method to retrieve a resource from a weak handle (used by [`super::GpuBindGroupPool`]) - pub fn get(&self, handle: GpuBufferHandle) -> Result { + pub fn get(&self, handle: GpuBufferHandle) -> Result { self.inner.get_from_handle(handle) } diff --git a/crates/ratchet-core/src/gpu/uniform.rs b/crates/ratchet-core/src/gpu/uniform.rs index c2e733a0..dbe4b2c5 100644 --- a/crates/ratchet-core/src/gpu/uniform.rs +++ b/crates/ratchet-core/src/gpu/uniform.rs @@ -5,7 +5,7 @@ use crate::{ rvec, }; -use super::{BindGroupDescriptor, GPUBuffer, GpuBindGroup, WgpuDevice}; +use super::{BindGroupDescriptor, GpuBindGroup, PooledGPUBuffer, WgpuDevice}; use encase::DynamicUniformBuffer; ///We use a single uniform buffer for all operations to hold their parameters. @@ -56,7 +56,7 @@ impl CpuUniform { } pub struct GpuUniform { - buf: GPUBuffer, + buf: PooledGPUBuffer, bind_group: GpuBindGroup, } diff --git a/crates/ratchet-core/src/lib.rs b/crates/ratchet-core/src/lib.rs index 6eecbfde..4ed8252c 100644 --- a/crates/ratchet-core/src/lib.rs +++ b/crates/ratchet-core/src/lib.rs @@ -33,6 +33,7 @@ pub use tensor_id::*; use smallvec::SmallVec; pub type RVec = SmallVec<[T; 4]>; pub type DRVec = SmallVec<[T; 8]>; //Double RVec +pub type RawGPUBuffer = wgpu::Buffer; //https://github.com/sonos/tract/blob/main/data/src/macros.rs#L2 #[macro_export] diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index b63147a7..1a4b3d21 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -1,26 +1,66 @@ use bytemuck::NoUninit; -use crate::{ - storage::{DeviceStorage, RawGPUBuffer}, - Device, DeviceError, Shape, TensorDType, -}; +use crate::{storage::DeviceStorage, Device, DeviceError, GPUBuffer, Shape, TensorDType}; -use std::{alloc::Layout, fmt::Debug}; +use std::{alloc::Layout, fmt::Debug, sync::Arc}; use crate::DType; #[derive(derive_new::new, Debug, PartialEq, Eq)] pub struct RawCPUBuffer(*mut u8, Layout); -unsafe impl Send for RawCPUBuffer {} - impl RawCPUBuffer { + pub fn into_raw_parts(&self) -> (*mut u8, Layout) { + (self.0, self.1) + } + + pub fn n_bytes(&self) -> usize { + self.1.size() + } + + pub fn as_bytes(&self) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } + } +} + +impl Clone for RawCPUBuffer { + fn clone(&self) -> Self { + let (ptr, layout) = self.into_raw_parts(); + let alloc = unsafe { std::alloc::alloc(layout) }; + unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; + + Self(alloc, layout) + } +} + +impl Drop for RawCPUBuffer { + fn drop(&mut self) { + if !self.0.is_null() && self.1.size() > 0 { + unsafe { std::alloc::dealloc(self.0, self.1) } + } + } +} + +/// Managed CPU buffer +#[derive(Debug, Clone, 
derive_new::new)] +pub struct CPUBuffer { + inner: Arc, +} + +unsafe impl Send for CPUBuffer {} +unsafe impl Sync for CPUBuffer {} + +impl CPUBuffer { pub fn from_slice(data: &[T], shape: &Shape) -> Self { assert_eq!(data.len(), shape.numel()); let bytes: &[u8] = bytemuck::cast_slice(data); Self::from_bytes(bytes, std::mem::align_of::()) } + pub fn inner(&self) -> &Arc { + &self.inner + } + unsafe fn uninitialized(size: usize, alignment: usize) -> Self { let layout = std::alloc::Layout::from_size_align(size, alignment).unwrap(); let data = if size == 0 { @@ -30,62 +70,56 @@ impl RawCPUBuffer { assert!(!ptr.is_null()); ptr } as *mut u8; - Self(data, layout) + Self::from_raw_parts(data, layout) } - pub fn inner(&self) -> (*mut u8, Layout) { - (self.0, self.1) - } - - pub fn as_bytes_mut(&mut self) -> &mut [u8] { - unsafe { std::slice::from_raw_parts_mut(self.0, self.1.size()) } - } - - pub fn as_bytes(&self) -> &[u8] { - unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } + pub fn from_raw_parts(data: *mut u8, layout: Layout) -> Self { + Self { + inner: Arc::new(RawCPUBuffer(data, layout)), + } } pub fn from_bytes(bytes: &[u8], alignment: usize) -> Self { - let mut storage = unsafe { Self::uninitialized(bytes.len(), alignment) }; - storage.as_bytes_mut().copy_from_slice(bytes); - storage + let layout = std::alloc::Layout::from_size_align(bytes.len(), alignment).unwrap(); + let data = if bytes.len() == 0 { + std::ptr::null() + } else { + let ptr = unsafe { std::alloc::alloc(layout) }; + assert!(!ptr.is_null()); + unsafe { ptr.copy_from_nonoverlapping(bytes.as_ptr(), bytes.len()) }; + ptr + } as *mut u8; + Self::from_raw_parts(data, layout) } -} -impl Clone for RawCPUBuffer { - fn clone(&self) -> Self { - let (ptr, layout) = self.inner(); + pub fn deep_clone(&self) -> Self { + let (ptr, layout) = self.inner().into_raw_parts(); let alloc = unsafe { std::alloc::alloc(layout) }; unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; - Self(alloc, layout) + Self::from_raw_parts(alloc, layout) } } -impl Drop for RawCPUBuffer { - fn drop(&mut self) { - if !self.0.is_null() && self.1.size() > 0 { - unsafe { std::alloc::dealloc(self.0, self.1) } - } +impl DeviceStorage for CPUBuffer { + fn to_device(&self, device: &Device) -> Result { + let gpu_device = device.try_gpu()?; + let raw = self.inner(); + let (ptr, layout) = raw.into_raw_parts(); + let bytes = unsafe { std::slice::from_raw_parts(ptr, layout.size()) }; + Ok(GPUBuffer::from_bytes(bytes, layout.align(), gpu_device)) } -} -impl DeviceStorage for RawCPUBuffer { - fn to_device(self, device: &Device) -> Result { - let (bytes, align, gpu_device) = (self.as_bytes(), self.1.align(), device.try_gpu()?); - Ok(RawGPUBuffer::from_bytes(bytes, align, gpu_device)) - } - - fn to_cpu(&self, _device: &Device) -> Result { + fn to_cpu(&self, _device: &Device) -> Result { Ok(self.clone()) } fn n_bytes(&self) -> usize { - self.1.size() + self.inner().n_bytes() } fn dump(&self, dtype: DType, full: bool) -> String { - let bytes = unsafe { std::slice::from_raw_parts(self.0, self.1.size()) }; + let bytes = self.inner().as_bytes(); fn dump_inner(data: &[T], full: bool) -> String { let length = if data.len() < 64 { data.len() } else { 64 }; diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index a5baedc7..c42309c8 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -1,8 +1,8 @@ use crate::{ gpu::{BufferDescriptor, WgpuDevice}, - 
gpu::{BufferUsagesExt, GPUBuffer}, - storage::{DeviceStorage, RawCPUBuffer}, - Device, DeviceError, Shape, TensorError, + gpu::{BufferUsagesExt, PooledGPUBuffer}, + storage::{CPUBuffer, DeviceStorage}, + Device, DeviceError, Shape, }; use bytemuck::NoUninit; @@ -10,13 +10,13 @@ use wgpu::BufferUsages; use crate::DType; -#[derive(Clone, derive_new::new)] -pub struct RawGPUBuffer { - pub(crate) inner: GPUBuffer, +#[derive(Clone, Debug, derive_new::new)] +pub struct GPUBuffer { + pub(crate) inner: PooledGPUBuffer, pub(crate) alignment: usize, } -impl RawGPUBuffer { +impl GPUBuffer { const MIN_SIZE: usize = 16; pub fn from_slice(data: &[T], shape: &Shape, device: &WgpuDevice) -> Self { @@ -37,7 +37,7 @@ impl RawGPUBuffer { } else { bytes }; - let buffer = device + let inner = device .create_buffer_init( &BufferDescriptor::new(bytes.len() as _, BufferUsages::standard(), false), bytes, @@ -45,10 +45,7 @@ impl RawGPUBuffer { .unwrap(); device.queue().submit(None); device.poll(wgpu::Maintain::Wait); - Self { - inner: buffer, - alignment, - } + Self { inner, alignment } } /// Returns true if the buffer has all the given usages. @@ -59,7 +56,7 @@ impl RawGPUBuffer { } } - pub fn inner(&self) -> &GPUBuffer { + pub fn inner(&self) -> &PooledGPUBuffer { &self.inner } @@ -68,26 +65,12 @@ impl RawGPUBuffer { } } -impl std::fmt::Debug for RawGPUBuffer { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("RawGPUBuffer") - .field("buf", &self.inner.global_id()) - .finish() - } -} - -impl PartialEq for RawGPUBuffer { - fn eq(&self, other: &Self) -> bool { - self.inner.global_id() == other.inner.global_id() - } -} - -impl DeviceStorage for RawGPUBuffer { - fn to_device(self, _: &Device) -> Result { - Ok(self) +impl DeviceStorage for GPUBuffer { + fn to_device(&self, _: &Device) -> Result { + Ok(self.clone()) } - fn to_cpu(&self, device: &Device) -> Result { + fn to_cpu(&self, device: &Device) -> Result { self.validate_usages(BufferUsages::COPY_SRC)?; let device = device.try_gpu()?; let buffer_slice = self.inner.slice(..); @@ -100,7 +83,7 @@ impl DeviceStorage for RawGPUBuffer { &buffer_slice, move |buffer| { tx.send(match buffer { - Ok(db) => Ok(RawCPUBuffer::from_bytes(&db, alignment)), + Ok(db) => Ok(CPUBuffer::from_bytes(&db, alignment)), Err(error) => Err(error), }) .expect("Failed to send result of read_buffer"); diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index f998dc16..d83cbeac 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -5,116 +5,58 @@ use bytemuck::NoUninit; pub use cpu_buffer::*; pub use gpu_buffer::*; -use crate::{gpu::GPUBuffer, Device, DeviceError, Shape}; +use crate::{Device, DeviceError, Shape}; use crate::DType; #[derive(Debug)] -pub struct Storage { - raw: Option, //Optional as the tensor may not be resolved +pub enum Storage { + CPU(CPUBuffer), + GPU(GPUBuffer), } -unsafe impl Send for Storage {} -unsafe impl Sync for Storage {} - impl Storage { - pub fn empty() -> Self { - Self { raw: None } - } - pub fn from_slice(data: &[T], shape: &Shape, device: &Device) -> Self { - assert_eq!(data.len(), shape.numel()); match device { - Device::CPU => Self { - raw: Some(RawStorage::CPU(RawCPUBuffer::from_slice(data, shape))), - }, - Device::GPU(d) => Self { - raw: Some(RawStorage::GPU(RawGPUBuffer::from_slice(data, shape, d))), - }, + Device::CPU => Storage::CPU(CPUBuffer::from_slice(data, shape)), + Device::GPU(g) => Storage::GPU(GPUBuffer::from_slice(data, 
shape, g)), } } - pub fn set_raw(&mut self, raw: RawStorage) { - self.raw = Some(raw); - } - - pub fn raw(&self) -> Option<&RawStorage> { - self.raw.as_ref() - } - - pub fn try_gpu(&self) -> Result<&RawGPUBuffer, DeviceError> { - match self.raw.as_ref() { - Some(RawStorage::GPU(raw)) => Ok(raw), - _ => Err(DeviceError::DeviceMismatch( - "GPU".to_string(), - "CPU".to_string(), - )), + pub fn dump(&self, dt: DType, full: bool) -> String { + match self { + Storage::CPU(c) => c.dump(dt, full), + Storage::GPU(g) => g.dump(dt, full), } } - pub fn try_cpu(&self) -> Result<&RawCPUBuffer, DeviceError> { - match self.raw.as_ref() { - Some(RawStorage::CPU(raw)) => Ok(raw), - _ => Err(DeviceError::DeviceMismatch( - "CPU".to_string(), - "GPU".to_string(), - )), + pub fn try_cpu(&self) -> Result<&CPUBuffer, DeviceError> { + match self { + Storage::CPU(c) => Ok(c), + _ => unimplemented!(), } } - pub fn dump(&self, dtype: DType, full: bool) -> String { - self.raw - .as_ref() - .map(|raw| match raw { - RawStorage::CPU(raw) => raw.dump(dtype, full), - RawStorage::GPU(raw) => raw.dump(dtype, full), - }) - .unwrap_or_else(|| "None".to_string()) - } -} - -impl From for Storage { - fn from(raw: RawStorage) -> Self { - Self { raw: Some(raw) } - } -} - -impl From for Storage { - fn from(raw: RawCPUBuffer) -> Self { - Self { - raw: Some(RawStorage::CPU(raw)), + pub fn try_gpu(&self) -> Result<&GPUBuffer, DeviceError> { + match self { + Storage::GPU(g) => Ok(g), + _ => unimplemented!(), } } -} -impl From for Storage { - fn from(raw: RawGPUBuffer) -> Self { - Self { - raw: Some(RawStorage::GPU(raw)), + pub fn deep_clone(&self, _: &Device) -> Result { + match self { + Storage::CPU(buf) => Ok(Storage::CPU(buf.deep_clone())), + _ => todo!(), } } } -#[derive(Debug)] -pub enum RawStorage { - CPU(RawCPUBuffer), - GPU(RawGPUBuffer), -} - -impl RawStorage { - pub fn from_gpu(buf: GPUBuffer, dtype: DType) -> Self { - RawStorage::GPU(RawGPUBuffer { - inner: buf, - alignment: dtype.size_of(), - }) - } -} - pub trait DeviceStorage: std::fmt::Debug + Clone + 'static { // To be expanded to other devices - fn to_device(self, device: &Device) -> Result; + fn to_device(&self, device: &Device) -> Result; /// Creates a copy of the device buffer on the CPU - fn to_cpu(&self, device: &Device) -> Result; + fn to_cpu(&self, device: &Device) -> Result; fn n_bytes(&self) -> usize; fn dump(&self, dt: DType, full: bool) -> String; } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 683e5dce..478c66b2 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -1,12 +1,12 @@ use crate::gpu::{CpuUniform, WgpuDevice}; use crate::{ - ops::*, CompiledOp, DType, Device, DeviceStorage, Executable, Operation, OperationError, - RawCPUBuffer, RawStorage, Shape, Storage, Strides, TensorDType, TensorId, + ops::*, CPUBuffer, CompiledOp, DType, Device, DeviceStorage, Executable, GPUBuffer, Operation, + OperationError, Shape, Storage, Strides, TensorDType, TensorId, }; use crate::{BinaryOp, LazyOp}; use derive_new::new; -use parking_lot::RwLock; +use parking_lot::{RwLock, RwLockReadGuard}; use std::sync::Arc; #[cfg(feature = "rand")] @@ -43,21 +43,24 @@ pub struct Tensor { } impl Tensor { - fn new(op: LazyOp, meta: StorageView, storage: Storage, device: Device) -> Self { + fn new(op: LazyOp, meta: StorageView, storage: Option, device: Device) -> Self { Self { inner: Arc::new(Inner::new(op, meta, storage, device)), } } fn lazy(op: LazyOp, meta: StorageView, device: Device) -> Self { - 
Self::new(op, meta, Storage::empty(), device) + Self::new(op, meta, None, device) + } + + fn update_storage(&self, storage: Storage) { + *self.inner.storage.write() = Some(storage); } } impl std::fmt::Debug for Tensor { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let storage = self.storage().try_read().expect("Could not read storage"); - let storage_fmt = storage.dump(self.dt(), false); + let storage_fmt = self.storage().as_ref().map(|s| s.dump(self.dt(), false)); let (id, op) = (self.id(), self.op()); f.debug_struct("Tensor") .field("id", &id) @@ -94,7 +97,7 @@ pub struct Inner { op: LazyOp, device: Device, view: StorageView, - storage: Arc>, + storage: Arc>>, } impl AsRef for Inner { @@ -104,7 +107,7 @@ impl AsRef for Inner { } impl Inner { - fn new(op: LazyOp, meta: StorageView, storage: Storage, device: Device) -> Self { + fn new(op: LazyOp, meta: StorageView, storage: Option, device: Device) -> Self { Self { id: TensorId::new(), view: meta, @@ -144,19 +147,12 @@ impl Tensor { &self.device } - pub fn storage(&self) -> &Arc> { - &self.storage + pub fn storage(&self) -> RwLockReadGuard> { + self.inner.storage.read() } pub fn resolved(&self) -> bool { - self.storage().try_read().unwrap().raw().is_some() - } - - /// # Safety - /// - /// Make sure your device & storage are compatible. - pub(crate) unsafe fn set_storage(&self, storage: Storage) { - *self.storage().write() = storage; + self.storage().is_some() } pub(crate) fn op(&self) -> &LazyOp { @@ -214,7 +210,7 @@ impl Tensor { let storage = Storage::from_slice(data.as_ref(), &shape, &device); let strides = Strides::from(&shape); let meta = StorageView::new(shape, T::dt(), strides); - Tensor::new(LazyOp::Const, meta, storage, device) + Tensor::new(LazyOp::Const, meta, Some(storage), device) } fn execution_order(&self) -> Vec { @@ -264,11 +260,14 @@ impl Tensor { for t in execution_order { if !t.resolved() { let id = t.id(); - let gpu_buf = allocations.get(&id).ok_or(TensorError::NoStorage(id))?; + let pooled_buffer = allocations.get(&id).ok_or(TensorError::NoStorage(id))?; assert!(t.device().is_gpu()); - unsafe { - t.set_storage(Storage::from(RawStorage::from_gpu(gpu_buf.clone(), t.dt()))); - } + + let storage = Storage::GPU(GPUBuffer { + inner: pooled_buffer.clone(), + alignment: t.dt().size_of(), + }); + t.update_storage(storage); } if let Some(compiled_op) = t.compile(&mut uniform, device) { @@ -281,36 +280,69 @@ impl Tensor { Ok(()) } - async fn to_cpu(&self) -> Result { - let raw_gpu_buf = { - let storage_resource = self.storage().try_read().ok_or(TensorError::NotResolved)?; - storage_resource.try_gpu()?.clone() + fn to_cpu(&self) -> Result { + if self.device().is_cpu() || !self.resolved() { + return Ok(self.clone()); + } + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let gpu_buf = match storage { + Storage::GPU(g) => g, + _ => unreachable!(), }; + let cpu_buf = gpu_buf.to_cpu(&self.device)?; + Ok(Tensor::new( LazyOp::Const, self.view.clone(), - Storage::from(raw_gpu_buf.to_cpu(self.device())?), + Some(Storage::CPU(cpu_buf)), Device::CPU, )) } + fn to_gpu(&self, dst_device: &Device) -> Result { + if self.device().is_gpu() || !self.resolved() { + return Ok(self.clone()); + } + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let cpu_buf = match storage { + Storage::CPU(g) => g, + _ => unreachable!(), + }; + let gpu_buf = cpu_buf.to_device(dst_device)?; + + let wgpu_device = dst_device.try_gpu()?; + Ok(Tensor::new( + 
LazyOp::Const, + self.view.clone(), + Some(Storage::GPU(gpu_buf)), + Device::GPU(wgpu_device.clone()), + )) + } + + /// Transfers the tensor to the specified device. + /// + /// If the tensor is already on the specified device, it will be returned as-is, + /// and the underlying storage will not be copied. + /// If the tensor is on a different device, it will be copied to the specified device. pub fn to(&self, device: Device) -> Result { - match (self.device(), device) { - (Device::GPU(_), Device::CPU) => pollster::block_on(self.to_cpu()), - (Device::CPU, Device::GPU(_)) => todo!(), + match (self.device(), &device) { + (Device::GPU(_), Device::CPU) => self.to_cpu(), + (Device::CPU, Device::GPU(_)) => self.to_gpu(&device), _ => Ok(self.clone()), } } #[cfg(feature = "pyo3")] - pub fn into_ndarray(&self) -> ArrayD { + pub fn into_ndarray(self) -> ArrayD { assert!(self.device().is_cpu()); - let storage = self.storage().try_read().unwrap(); - let raw_cpu = storage.try_cpu().unwrap(); let shape = self.shape().to_vec(); if self.num_bytes() != 0 { - let ptr = raw_cpu.inner().0 as *const T; - unsafe { ArrayViewD::from_shape_ptr(shape, ptr).to_owned() } + let storage_guard = self.storage(); + let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); + let (ptr, _) = buffer.inner().into_raw_parts(); + unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() } @@ -322,7 +354,19 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(*py, self.clone().into_ndarray::()) + PyArray::from_owned_array(*py, self.deep_clone().into_ndarray::()) + } + + pub fn deep_clone(&self) -> Tensor { + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let cloned_storage = storage.deep_clone(self.device()).unwrap(); + Tensor::new( + LazyOp::Const, + self.view.clone(), + Some(cloned_storage), + self.device.clone(), + ) } } @@ -338,13 +382,16 @@ impl From> for Tensor { let shape = it.shape().to_vec().into(); let strides = Strides::from(&shape); let vec = it.into_raw_vec().into_boxed_slice(); - //This is causing a double free let ptr = Box::into_raw(vec) as *mut u8; - let raw_buf = RawCPUBuffer::new(ptr, layout); - let storage = Storage::from(RawStorage::CPU(raw_buf)); + let cpu_buf = CPUBuffer::from_raw_parts(ptr, layout); let meta = StorageView::new(shape, T::dt(), strides); - Tensor::new(LazyOp::Const, meta, storage, Device::CPU) + Tensor::new( + LazyOp::Const, + meta, + Some(Storage::CPU(cpu_buf)), + Device::CPU, + ) } else { panic!("Cannot convert numpy array with non-contiguous memory layout to tensor"); } @@ -383,10 +430,20 @@ mod tests { #[test] fn test_pyo3() -> anyhow::Result<()> { - let device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], device.clone()); - let b = Tensor::randn::(shape![1024, 1024], device.clone()); - let c: anyhow::Result = Python::with_gil(|py| { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; + + let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( py, r#" @@ -405,7 +462,8 @@ def 
matmul(a, b): .extract::<&PyArrayDyn>()?; Ok(Tensor::from(result)) }); - println!("\nTORCH: {:#?}", c); + println!("\nTORCH: {:#?}", ground); + println!("\nOURS: {:#?}", our_result); Ok(()) } From cd31192df8d7c9650bb6e8897bb00f5979f2d1fc Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 16:49:01 +0000 Subject: [PATCH 08/37] chore: not terrible --- .../ratchet-core/src/gpu/buffer_allocator.rs | 2 +- crates/ratchet-core/src/gpu/device.rs | 7 ++-- crates/ratchet-core/src/storage/gpu_buffer.rs | 22 +++++++++++- crates/ratchet-core/src/storage/mod.rs | 7 ++-- crates/ratchet-core/src/tensor.rs | 36 +++++++++---------- 5 files changed, 49 insertions(+), 25 deletions(-) diff --git a/crates/ratchet-core/src/gpu/buffer_allocator.rs b/crates/ratchet-core/src/gpu/buffer_allocator.rs index e6017fb2..a6af07c4 100644 --- a/crates/ratchet-core/src/gpu/buffer_allocator.rs +++ b/crates/ratchet-core/src/gpu/buffer_allocator.rs @@ -158,7 +158,7 @@ impl BufferAllocator { let output = execution_order.last().unwrap(); assignments.insert( output.id(), - device.allocate_buffer(&BufferDescriptor { + device.get_or_create_buffer(&BufferDescriptor { size: output.num_bytes() as _, usage: BufferUsages::standard(), mapped_at_creation: false, diff --git a/crates/ratchet-core/src/gpu/device.rs b/crates/ratchet-core/src/gpu/device.rs index a5fa847c..04bfb0f6 100644 --- a/crates/ratchet-core/src/gpu/device.rs +++ b/crates/ratchet-core/src/gpu/device.rs @@ -147,7 +147,7 @@ impl WgpuDevice { } impl WgpuDevice { - pub fn create_buffer_init( + pub fn get_or_create_buffer_init( &self, desc: &BufferDescriptor, contents: &[u8], @@ -161,7 +161,10 @@ impl WgpuDevice { self.buffer_allocator.create_uniform_init(cpu_uniform, self) } - pub fn allocate_buffer(&self, desc: &BufferDescriptor) -> Result { + pub fn get_or_create_buffer( + &self, + desc: &BufferDescriptor, + ) -> Result { Ok(self.buffer_allocator.create_buffer(desc, self)) } diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index c42309c8..cd268568 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -38,7 +38,7 @@ impl GPUBuffer { bytes }; let inner = device - .create_buffer_init( + .get_or_create_buffer_init( &BufferDescriptor::new(bytes.len() as _, BufferUsages::standard(), false), bytes, ) @@ -63,6 +63,26 @@ impl GPUBuffer { pub fn usage(&self) -> BufferUsages { self.inner.usage() } + + pub fn deep_clone(&self, device: &WgpuDevice) -> Self { + //Here we need to create a buffer just like ours + let clone = device + .get_or_create_buffer(&BufferDescriptor::new( + self.inner.size(), + self.inner.usage(), + false, + )) + .unwrap(); + let mut encoder = + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + encoder.copy_buffer_to_buffer(&self.inner, 0, &clone, 0, self.inner.size()); + device.queue().submit(Some(encoder.finish())); + device.poll(wgpu::Maintain::Wait); + Self { + inner: clone, + alignment: self.alignment, + } + } } impl DeviceStorage for GPUBuffer { diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index d83cbeac..921a6c20 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -44,10 +44,13 @@ impl Storage { } } - pub fn deep_clone(&self, _: &Device) -> Result { + pub fn deep_clone(&self, device: &Device) -> Result { match self { Storage::CPU(buf) => Ok(Storage::CPU(buf.deep_clone())), - _ => todo!(), + 
Storage::GPU(buf) => { + let gpu_device = device.try_gpu()?; + Ok(Storage::GPU(buf.deep_clone(gpu_device))) + } } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 478c66b2..12db2744 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -285,11 +285,10 @@ impl Tensor { return Ok(self.clone()); } let storage_guard = self.storage(); - let storage = storage_guard.as_ref().unwrap(); - let gpu_buf = match storage { - Storage::GPU(g) => g, - _ => unreachable!(), - }; + let gpu_buf = storage_guard + .as_ref() + .ok_or(TensorError::TransferError)? + .try_gpu()?; let cpu_buf = gpu_buf.to_cpu(&self.device)?; Ok(Tensor::new( @@ -305,11 +304,10 @@ impl Tensor { return Ok(self.clone()); } let storage_guard = self.storage(); - let storage = storage_guard.as_ref().unwrap(); - let cpu_buf = match storage { - Storage::CPU(g) => g, - _ => unreachable!(), - }; + let cpu_buf = storage_guard + .as_ref() + .ok_or(TensorError::TransferError)? + .try_cpu()?; let gpu_buf = cpu_buf.to_device(dst_device)?; let wgpu_device = dst_device.try_gpu()?; @@ -434,15 +432,6 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( py, @@ -462,6 +451,15 @@ def matmul(a, b): .extract::<&PyArrayDyn>()?; Ok(Tensor::from(result)) }); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; println!("\nTORCH: {:#?}", ground); println!("\nOURS: {:#?}", our_result); From 1859f9e902e388b933b17e9f527839277663a328 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:24:50 +0000 Subject: [PATCH 09/37] chore: check test --- crates/ratchet-core/src/tensor.rs | 80 ++++++++++++++++--------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 12db2744..3d3ca2d3 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -426,43 +426,45 @@ mod tests { Ok(()) } - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() -"#, - "x.py", - "x", - )?; - - let result = prg - .getattr("matmul")? - .call1((a.to_py::(&py), b.to_py::(&py)))? 
- .extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) - }); - - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - println!("\nTORCH: {:#?}", ground); - println!("\nOURS: {:#?}", our_result); - - Ok(()) - } + /* + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let result = prg + .getattr("matmul")? + .call1((a.to_py::(&py), b.to_py::(&py)))? + .extract::<&PyArrayDyn>()?; + Ok(Tensor::from(result)) + }); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; + println!("\nTORCH: {:#?}", ground); + println!("\nOURS: {:#?}", our_result); + + Ok(()) + } + */ } From 21cbe5b2f8ee64785c962741fca9f0679aa6fd70 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:39:13 +0000 Subject: [PATCH 10/37] chore: still freeing somewhere --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/storage/cpu_buffer.rs | 2 + crates/ratchet-core/src/tensor.rs | 72 ++++++++++--------- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7c62f447..3f36e9f6 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -46,6 +46,6 @@ jobs: - name: Build run: cargo build - name: Run tests - run: cargo test + run: cargo test -- --nocapture - name: Run integration tests run: (cd crates/ratchet-integration-tests;sh run-tests.sh) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 1a4b3d21..5feecce7 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -94,8 +94,10 @@ impl CPUBuffer { pub fn deep_clone(&self) -> Self { let (ptr, layout) = self.inner().into_raw_parts(); + println!("before deep clone: {:p}", ptr); let alloc = unsafe { std::alloc::alloc(layout) }; unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; + println!("after deep clone: {:p}", alloc); Self::from_raw_parts(alloc, layout) } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 3d3ca2d3..40cb2978 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -340,6 +340,7 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); + println!("INTO NDARRAY: {:?}", ptr); unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() @@ -426,45 +427,46 @@ mod tests { Ok(()) } - /* - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - let 
ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" +import torch + +def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() "#, - "x.py", - "x", - )?; + "x.py", + "x", + )?; + + let result = prg + .getattr("matmul")? + .call1((a.to_py::(&py), b.to_py::(&py)))? + .extract::<&PyArrayDyn>()?; + Ok(Tensor::from(result)) + }); + println!("\nTORCH: {:#?}", ground); - let result = prg - .getattr("matmul")? - .call1((a.to_py::(&py), b.to_py::(&py)))? - .extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) - }); + println!("\nA: {:#?}", a); + println!("\nB: {:#?}", b); - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; - let c = a.matmul(&b)?; - c.resolve()?; + let c = a.matmul(&b)?; + c.resolve()?; - let our_result = c.to(cpu_device)?; - println!("\nTORCH: {:#?}", ground); - println!("\nOURS: {:#?}", our_result); + let our_result = c.to(cpu_device)?; + println!("\nOURS: {:#?}", our_result); - Ok(()) - } - */ + Ok(()) + } } From 4daf1f4daa96e50b479e23bde75379b9ffa658fb Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:49:10 +0000 Subject: [PATCH 11/37] chore: very confusing --- crates/ratchet-core/src/tensor.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 40cb2978..d5a5c2fd 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -353,7 +353,10 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(*py, self.deep_clone().into_ndarray::()) + println!("TO PY: {:?}", self); + let cloned = self.deep_clone(); + println!("CLONED: {:?}", cloned); + PyArray::from_owned_array(*py, cloned.into_ndarray::()) } pub fn deep_clone(&self) -> Tensor { From f0fbc7ef9fe36a5160ca6eac26f2c462c5777eac Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:59:46 +0000 Subject: [PATCH 12/37] chore: check allocator bug --- crates/ratchet-core/src/tensor.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index d5a5c2fd..25f989b7 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -340,7 +340,7 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); - println!("INTO NDARRAY: {:?}", ptr); + println!("POINTER PASSED TO NDARRAY: {:?}", ptr); unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() @@ -353,9 +353,7 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - println!("TO PY: {:?}", self); let cloned = self.deep_clone(); - 
println!("CLONED: {:?}", cloned); PyArray::from_owned_array(*py, cloned.into_ndarray::()) } @@ -433,8 +431,8 @@ mod tests { #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let a = Tensor::randn::(shape![1024, 512], cpu_device.clone()); + let b = Tensor::randn::(shape![512, 384], cpu_device.clone()); let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( From 14a4a1e0b870303bf35b5909b34977b8cdf5d335 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 18:07:52 +0000 Subject: [PATCH 13/37] chore: print drop --- crates/ratchet-core/src/storage/cpu_buffer.rs | 1 + crates/ratchet-core/src/tensor.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 5feecce7..a9e0d606 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -36,6 +36,7 @@ impl Clone for RawCPUBuffer { impl Drop for RawCPUBuffer { fn drop(&mut self) { if !self.0.is_null() && self.1.size() > 0 { + println!("DROPPING CPU BUFFER: {:p}", self.0); unsafe { std::alloc::dealloc(self.0, self.1) } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 25f989b7..9fe056c5 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -413,6 +413,7 @@ mod tests { use super::*; + /* #[test] fn test_matmul() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; @@ -427,6 +428,7 @@ mod tests { println!("\nD: {:#?}", d); Ok(()) } + */ #[test] fn test_pyo3() -> anyhow::Result<()> { From 70f01c569fdc8dc00b1c2f35035187f1a493a71d Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 18:24:40 +0000 Subject: [PATCH 14/37] chore: strange --- crates/ratchet-core/src/storage/cpu_buffer.rs | 15 ++++++++----- crates/ratchet-core/src/tensor.rs | 22 ++++++++----------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index a9e0d606..d7ae546b 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -25,18 +25,21 @@ impl RawCPUBuffer { impl Clone for RawCPUBuffer { fn clone(&self) -> Self { - let (ptr, layout) = self.into_raw_parts(); - let alloc = unsafe { std::alloc::alloc(layout) }; - unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; - - Self(alloc, layout) + let data = if self.1.size() == 0 { + std::ptr::null() + } else { + let ptr = unsafe { std::alloc::alloc(self.1) }; + assert!(!ptr.is_null()); + ptr + } as *mut u8; + unsafe { self.0.copy_to_nonoverlapping(data, self.1.size()) }; + Self(data, self.1) } } impl Drop for RawCPUBuffer { fn drop(&mut self) { if !self.0.is_null() && self.1.size() > 0 { - println!("DROPPING CPU BUFFER: {:p}", self.0); unsafe { std::alloc::dealloc(self.0, self.1) } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 9fe056c5..7ffe2fe3 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -340,21 +340,19 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); - println!("POINTER 
PASSED TO NDARRAY: {:?}", ptr); - unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } + unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).into_owned() } } else { - ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() + ArrayViewD::from_shape(shape, &[]).unwrap().into_owned() } } #[cfg(feature = "pyo3")] - pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>( - &'s self, - py: &'p pyo3::Python<'p>, + pub fn to_py<'p, T: TensorDType + numpy::Element>( + self, + py: pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - let cloned = self.deep_clone(); - PyArray::from_owned_array(*py, cloned.into_ndarray::()) + PyArray::from_owned_array(py, self.into_ndarray::()) } pub fn deep_clone(&self) -> Tensor { @@ -413,7 +411,6 @@ mod tests { use super::*; - /* #[test] fn test_matmul() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; @@ -428,13 +425,12 @@ mod tests { println!("\nD: {:#?}", d); Ok(()) } - */ #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 512], cpu_device.clone()); - let b = Tensor::randn::(shape![512, 384], cpu_device.clone()); + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( @@ -451,7 +447,7 @@ def matmul(a, b): let result = prg .getattr("matmul")? - .call1((a.to_py::(&py), b.to_py::(&py)))? + .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? .extract::<&PyArrayDyn>()?; Ok(Tensor::from(result)) }); From 4cf51d69438a7073d2410f471621f6f1866bed73 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 10:26:17 +0000 Subject: [PATCH 15/37] chore: faster wasm-pack --- .github/workflows/rust.yml | 4 +- crates/ratchet-core/src/tensor.rs | 87 ++++++++++++++++--------------- 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3f36e9f6..37bd93c5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,9 +40,9 @@ jobs: python-version: '3.10.6' cache: 'pip' - run: pip install -r requirements.txt - - name: Setup + - name: Install wasm-pack run: | - cargo install wasm-pack + curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh - name: Build run: cargo build - name: Run tests diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 7ffe2fe3..e51454bc 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -426,46 +426,49 @@ mod tests { Ok(()) } - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let result = prg - .getattr("matmul")? - .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? 
- .extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) - }); - println!("\nTORCH: {:#?}", ground); - - println!("\nA: {:#?}", a); - println!("\nB: {:#?}", b); - - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - println!("\nOURS: {:#?}", our_result); - - Ok(()) - } + /* + + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let result = prg + .getattr("matmul")? + .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? + .extract::<&PyArrayDyn>()?; + Ok(Tensor::from(result)) + }); + println!("\nTORCH: {:#?}", ground); + + println!("\nA: {:#?}", a); + println!("\nB: {:#?}", b); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; + println!("\nOURS: {:#?}", our_result); + + Ok(()) + } + */ } From 4388c98ee91144a2036f37bf5bdf9239c3bd2355 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 10:28:20 +0000 Subject: [PATCH 16/37] chore: downgrade wgpu --- Cargo.toml | 2 +- crates/ratchet-core/src/gpu/device.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 20543cea..6b06221c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ strip = true #debug = 2 [workspace.dependencies] -wgpu = { version = "0.19.0", features = ["fragile-send-sync-non-atomic-wasm"] } +wgpu = { version = "0.18.0", features = ["fragile-send-sync-non-atomic-wasm", "expose-ids"] } anyhow = "1.0.40" bytemuck = "1.14.0" num-traits = "0.2.17" diff --git a/crates/ratchet-core/src/gpu/device.rs b/crates/ratchet-core/src/gpu/device.rs index 04bfb0f6..4cb52286 100644 --- a/crates/ratchet-core/src/gpu/device.rs +++ b/crates/ratchet-core/src/gpu/device.rs @@ -56,7 +56,7 @@ impl WgpuDevice { let adapter = Self::select_adapter()?; #[allow(unused_mut)] - let mut required_features = wgpu::Features::default(); + let mut features = wgpu::Features::default(); #[cfg(feature = "gpu-profiling")] { features |= wgpu::Features::TIMESTAMP_QUERY; @@ -64,8 +64,8 @@ impl WgpuDevice { let mut device_descriptor = wgpu::DeviceDescriptor { label: Some("ratchet"), - required_features, - required_limits: Limits { + features, + limits: Limits { max_buffer_size: MAX_BUFFER_SIZE, max_storage_buffer_binding_size: MAX_BUFFER_SIZE as u32, ..Default::default() @@ -77,7 +77,7 @@ impl WgpuDevice { "Failed to acq. 
device, trying again with reduced limits: {:?}",
                e
            );
            device_descriptor.limits = adapter.limits();
            adapter.request_device(&device_descriptor, None).await
        } else {
            device_request

From a90cb6322e931655c3a7cec42d4daeb9e8a65b7c Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 10:35:42 +0000
Subject: [PATCH 17/37] chore: unused deps, add back tests

---
 crates/ratchet-core/Cargo.toml    |  2 -
 crates/ratchet-core/src/tensor.rs | 67 +++++++++++++++----------------
 2 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/crates/ratchet-core/Cargo.toml b/crates/ratchet-core/Cargo.toml
index f2099d76..31ea049f 100644
--- a/crates/ratchet-core/Cargo.toml
+++ b/crates/ratchet-core/Cargo.toml
@@ -28,9 +28,7 @@ slotmap = "1.0.7"
 parking_lot = "0.12.1"
 smallvec = "1.11.2"
 encase = "0.7.0"
-glam = "0.25.0"
 pollster = "0.3.0"
-futures-intrusive = "0.5.0"
 anyhow = "1.0.79"
 num = "0.4.1"
 rand_distr = { version = "0.4.3", optional = true }

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index e51454bc..4c10b301 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -426,49 +426,46 @@ mod tests {
         Ok(())
     }

-    /*
-
-    #[test]
-    fn test_pyo3() -> anyhow::Result<()> {
-        let cpu_device = Device::request_device(DeviceRequest::CPU)?;
-        let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
-        let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
-
-        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
-            let prg = PyModule::from_code(
-                py,
-                r#"
-    import torch
-
-    def matmul(a, b):
-        return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
-    "#,
-                "x.py",
-                "x",
-            )?;
-
-            let result = prg
-                .getattr("matmul")?
-                .call1((a.clone().to_py::<f32>(py), b.clone().to_py::<f32>(py)))?
-                .extract::<&PyArrayDyn<f32>>()?;
-            Ok(Tensor::from(result))
-        });
-        println!("\nTORCH: {:#?}", ground);
-
-        println!("\nA: {:#?}", a);
-        println!("\nB: {:#?}", b);
-
-        let gpu_device = Device::request_device(DeviceRequest::GPU)?;
-        let a = a.to(gpu_device.clone())?;
-        let b = b.to(gpu_device)?;
-
-        let c = a.matmul(&b)?;
-        c.resolve()?;
-
-        let our_result = c.to(cpu_device)?;
-        println!("\nOURS: {:#?}", our_result);
-
-        Ok(())
-    }
-    */
+    #[test]
+    fn test_pyo3() -> anyhow::Result<()> {
+        let cpu_device = Device::request_device(DeviceRequest::CPU)?;
+        let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+        let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+
+        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
+            let prg = PyModule::from_code(
+                py,
+                r#"
+    import torch
+
+    def matmul(a, b):
+        return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+"#,
+                "x.py",
+                "x",
+            )?;
+
+            let result = prg
+                .getattr("matmul")?
+                .call1((a.clone().to_py::<f32>(py), b.clone().to_py::<f32>(py)))?
+                .extract::<&PyArrayDyn<f32>>()?;
+            Ok(Tensor::from(result))
+        });
+        println!("\nTORCH: {:#?}", ground);
+
+        println!("\nA: {:#?}", a);
+        println!("\nB: {:#?}", b);
+
+        let gpu_device = Device::request_device(DeviceRequest::GPU)?;
+        let a = a.to(gpu_device.clone())?;
+        let b = b.to(gpu_device)?;
+
+        let c = a.matmul(&b)?;
+        c.resolve()?;
+
+        let our_result = c.to(cpu_device)?;
+        println!("\nOURS: {:#?}", our_result);
+
+        Ok(())
+    }
 }

From d40910ba7b757b09f9d83d348fc50b4d22bb39e6 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 10:40:41 +0000
Subject: [PATCH 18/37] chore: tests interacting?
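
The Python helper embedded in the r#"..."# raw string is handed to the
Python compiler verbatim by PyModule::from_code, so module-level
statements must start at column 0; the indented form fails with an
IndentationError before matmul is ever called. Flush the embedded source
left, and drop test_matmul to 512x512 while both tests run in the same
process. A minimal sketch of the failure mode follows; it is illustrative
only, assumes pyo3 0.20 with the auto-initialize feature, and the module
names in it are placeholders rather than anything in this patch:

    use pyo3::{types::PyModule, Python};

    fn main() {
        Python::with_gil(|py| {
            // Indented module-level source is rejected by the Python
            // compiler with an IndentationError before anything executes.
            let indented = "    import math";
            assert!(PyModule::from_code(py, indented, "x.py", "x").is_err());

            // The same statement flush against column 0 compiles fine.
            let flush = "import math";
            assert!(PyModule::from_code(py, flush, "x.py", "x").is_ok());
        });
    }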
---
 crates/ratchet-core/src/tensor.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 4c10b301..a627f8dc 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -414,8 +414,8 @@ mod tests {
     #[test]
     fn test_matmul() -> anyhow::Result<()> {
         let device = Device::request_device(DeviceRequest::GPU)?;
-        let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
-        let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
+        let a = Tensor::randn::<f32>(shape![512, 512], device.clone());
+        let b = Tensor::randn::<f32>(shape![512, 512], device.clone());
         let c = a.matmul(&b)?;
         c.resolve()?;
         println!("\nA: {:#?}", a);
@@ -436,10 +436,10 @@ mod tests {
             let prg = PyModule::from_code(
                 py,
                 r#"
-    import torch
+import torch

-    def matmul(a, b):
-        return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+def matmul(a, b):
+    return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
 "#,
                 "x.py",
                 "x",

From fd6e6df4f2da6a310f3d873a5e5a2ff79f818c24 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 10:52:42 +0000
Subject: [PATCH 19/37] chore: mem management

chore: confusing

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc
---
 .github/workflows/rust.yml                    |  8 +-
 Cargo.toml                                    |  2 +-
 crates/ratchet-core/src/device.rs             |  4 +-
 crates/ratchet-core/src/quant.rs              |  5 --
 crates/ratchet-core/src/storage/cpu_buffer.rs | 77 +++++++++---------
 crates/ratchet-core/src/storage/gpu_buffer.rs |  1 -
 crates/ratchet-core/src/storage/mod.rs        |  7 +-
 crates/ratchet-core/src/tensor.rs             | 80 +++++++++----------
 8 files changed, 84 insertions(+), 100 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 37bd93c5..8caa488c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -35,17 +35,15 @@ jobs:
         sudo apt install -y libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev vulkan-sdk mesa-vulkan-drivers pkg-config libasound2-dev

     - name: Setup python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: '3.10.6'
         cache: 'pip'
     - run: pip install -r requirements.txt
+    - name: Run tests
+      run: cargo test -- --nocapture
     - name: Install wasm-pack
       run: |
         curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
-    - name: Build
-      run: cargo build
-    - name: Run tests
-      run: cargo test -- --nocapture
     - name: Run integration tests
       run: (cd crates/ratchet-integration-tests;sh run-tests.sh)

diff --git a/Cargo.toml b/Cargo.toml
index 6b06221c..a8cae20f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,7 @@ strip = true
 [workspace.dependencies]
 wgpu = { version = "0.18.0", features = ["fragile-send-sync-non-atomic-wasm", "expose-ids"] }
 anyhow = "1.0.40"
-bytemuck = "1.14.0"
+bytemuck = { version = "1.14.0", features=["wasm_simd", "aarch64_simd", "extern_crate_alloc"] }
 num-traits = "0.2.17"
 half = { version = "2.3.1", features = ["num-traits", "bytemuck"] }
 derive-new = "0.6.0"

diff --git a/crates/ratchet-core/src/device.rs b/crates/ratchet-core/src/device.rs
index d30e8da1..929f7f4a 100644
--- a/crates/ratchet-core/src/device.rs
+++ b/crates/ratchet-core/src/device.rs
@@ -51,7 +51,9 @@ impl Device {
     pub fn request_device(request: DeviceRequest) -> Result<Device, DeviceError> {
         match request {
             DeviceRequest::CPU 
=> Ok(Device::CPU), - DeviceRequest::GPU => Ok(Device::GPU(pollster::block_on(WgpuDevice::new())?)), + DeviceRequest::GPU => Ok(Device::GPU(pollster::block_on(async { + WgpuDevice::new().await + })?)), } } diff --git a/crates/ratchet-core/src/quant.rs b/crates/ratchet-core/src/quant.rs index 1bbad339..c71b9178 100644 --- a/crates/ratchet-core/src/quant.rs +++ b/crates/ratchet-core/src/quant.rs @@ -122,12 +122,9 @@ mod tests { let mut rng = rand::thread_rng(); let range = Uniform::new(-0.2, 0.2); let matrix: Vec = (0..M * N).map(|_| rng.sample(range)).collect(); - println!("Original matrix: {:?}", matrix); let (quantized_matrix, absmax) = super::sint8_quantize(&matrix, M, N); - println!("Absmax: {:?}", absmax); let dequantized_matrix = super::sint8_dequantize(&quantized_matrix, &absmax, M, N); - println!("Dequantized matrix: {:?}", dequantized_matrix); for i in 0..matrix.len() { assert!((matrix[i] - dequantized_matrix[i]).abs() < 0.001); } @@ -138,12 +135,10 @@ mod tests { let matrix = vec![ 0.1, -0.1, 0.6, -0.5, 1.0, -1.0, 1.2, -1.2, 0.1, -0.1, 0.5, -0.5, 1.0, -1.0, 1.2, -1.2, ]; - println!("{:?}", matrix); let (quantized_matrix, absmax) = super::sint4_quantize(&matrix, 4, 4); assert_eq!(quantized_matrix.len(), 2); assert_eq!(quantized_matrix, vec![2544293105, 2544292849]); let dequantized_matrix = super::sint4_dequantize(&quantized_matrix, absmax, 4, 4); - println!("{:?}", dequantized_matrix); for i in 0..matrix.len() { assert!((matrix[i] - dequantized_matrix[i]).abs() < 0.1); } diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index d7ae546b..f1e409fd 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -10,6 +10,10 @@ use crate::DType; pub struct RawCPUBuffer(*mut u8, Layout); impl RawCPUBuffer { + pub fn from_raw_parts(ptr: *mut u8, layout: Layout) -> Self { + Self(ptr, layout) + } + pub fn into_raw_parts(&self) -> (*mut u8, Layout) { (self.0, self.1) } @@ -21,19 +25,36 @@ impl RawCPUBuffer { pub fn as_bytes(&self) -> &[u8] { unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } } + + pub fn as_bytes_mut(&mut self) -> &mut [u8] { + unsafe { std::slice::from_raw_parts_mut(self.0, self.1.size()) } + } + + pub fn uninitialized(size: usize, alignment: usize) -> Self { + let layout = std::alloc::Layout::from_size_align(size, alignment).unwrap(); + let data = if size == 0 { + std::ptr::null() + } else { + let ptr = unsafe { std::alloc::alloc(layout) }; + assert!(!ptr.is_null()); + ptr + } as *mut u8; + Self(data, layout) + } } impl Clone for RawCPUBuffer { fn clone(&self) -> Self { - let data = if self.1.size() == 0 { + let (ptr, layout) = self.into_raw_parts(); + let data = if layout.size() == 0 { std::ptr::null() } else { - let ptr = unsafe { std::alloc::alloc(self.1) }; + let ptr = unsafe { std::alloc::alloc(layout) }; assert!(!ptr.is_null()); ptr } as *mut u8; - unsafe { self.0.copy_to_nonoverlapping(data, self.1.size()) }; - Self(data, self.1) + unsafe { ptr.copy_to_nonoverlapping(data, layout.size()) }; + Self(data, layout) } } @@ -48,7 +69,7 @@ impl Drop for RawCPUBuffer { /// Managed CPU buffer #[derive(Debug, Clone, derive_new::new)] pub struct CPUBuffer { - inner: Arc, + inner: RawCPUBuffer, } unsafe impl Send for CPUBuffer {} @@ -61,49 +82,25 @@ impl CPUBuffer { Self::from_bytes(bytes, std::mem::align_of::()) } - pub fn inner(&self) -> &Arc { + pub fn inner(&self) -> &RawCPUBuffer { &self.inner } - unsafe fn uninitialized(size: usize, alignment: usize) -> 
Self { - let layout = std::alloc::Layout::from_size_align(size, alignment).unwrap(); - let data = if size == 0 { - std::ptr::null() - } else { - let ptr = std::alloc::alloc(layout); - assert!(!ptr.is_null()); - ptr - } as *mut u8; - Self::from_raw_parts(data, layout) - } - - pub fn from_raw_parts(data: *mut u8, layout: Layout) -> Self { - Self { - inner: Arc::new(RawCPUBuffer(data, layout)), - } - } - pub fn from_bytes(bytes: &[u8], alignment: usize) -> Self { - let layout = std::alloc::Layout::from_size_align(bytes.len(), alignment).unwrap(); - let data = if bytes.len() == 0 { - std::ptr::null() - } else { - let ptr = unsafe { std::alloc::alloc(layout) }; - assert!(!ptr.is_null()); - unsafe { ptr.copy_from_nonoverlapping(bytes.as_ptr(), bytes.len()) }; - ptr - } as *mut u8; - Self::from_raw_parts(data, layout) + let mut raw = RawCPUBuffer::uninitialized(bytes.len(), alignment); + raw.as_bytes_mut().copy_from_slice(bytes); + Self::from(raw) } pub fn deep_clone(&self) -> Self { - let (ptr, layout) = self.inner().into_raw_parts(); - println!("before deep clone: {:p}", ptr); - let alloc = unsafe { std::alloc::alloc(layout) }; - unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; - println!("after deep clone: {:p}", alloc); + let raw_clone = (*self.inner()).clone(); + Self::from(raw_clone) + } +} - Self::from_raw_parts(alloc, layout) +impl From for CPUBuffer { + fn from(raw: RawCPUBuffer) -> Self { + CPUBuffer { inner: raw } } } diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index cd268568..1631d3c7 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -65,7 +65,6 @@ impl GPUBuffer { } pub fn deep_clone(&self, device: &WgpuDevice) -> Self { - //Here we need to create a buffer just like ours let clone = device .get_or_create_buffer(&BufferDescriptor::new( self.inner.size(), diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index 921a6c20..f6bcc0d3 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -46,11 +46,8 @@ impl Storage { pub fn deep_clone(&self, device: &Device) -> Result { match self { - Storage::CPU(buf) => Ok(Storage::CPU(buf.deep_clone())), - Storage::GPU(buf) => { - let gpu_device = device.try_gpu()?; - Ok(Storage::GPU(buf.deep_clone(gpu_device))) - } + Storage::CPU(c) => Ok(Storage::CPU(c.deep_clone())), + _ => unimplemented!(), } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index a627f8dc..dcadedfe 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -1,7 +1,7 @@ use crate::gpu::{CpuUniform, WgpuDevice}; use crate::{ ops::*, CPUBuffer, CompiledOp, DType, Device, DeviceStorage, Executable, GPUBuffer, Operation, - OperationError, Shape, Storage, Strides, TensorDType, TensorId, + OperationError, RawCPUBuffer, Shape, Storage, Strides, TensorDType, TensorId, }; use crate::{BinaryOp, LazyOp}; @@ -340,29 +340,32 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); - unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).into_owned() } + unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { - ArrayViewD::from_shape(shape, &[]).unwrap().into_owned() + ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() } } #[cfg(feature = "pyo3")] - pub fn to_py<'p, 
T: TensorDType + numpy::Element>( - self, - py: pyo3::Python<'p>, + pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>( + &'s self, + py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(py, self.into_ndarray::()) + PyArray::from_owned_array(*py, self.clone().into_ndarray::()) } pub fn deep_clone(&self) -> Tensor { - let storage_guard = self.storage(); - let storage = storage_guard.as_ref().unwrap(); - let cloned_storage = storage.deep_clone(self.device()).unwrap(); + let storage_clone = self + .storage() + .as_ref() + .unwrap() + .deep_clone(self.device()) + .unwrap(); Tensor::new( - LazyOp::Const, + self.op().clone(), self.view.clone(), - Some(cloned_storage), + Some(storage_clone), self.device.clone(), ) } @@ -382,12 +385,12 @@ impl From> for Tensor { let vec = it.into_raw_vec().into_boxed_slice(); let ptr = Box::into_raw(vec) as *mut u8; - let cpu_buf = CPUBuffer::from_raw_parts(ptr, layout); + let raw_buf = RawCPUBuffer::from_raw_parts(ptr, layout); let meta = StorageView::new(shape, T::dt(), strides); Tensor::new( LazyOp::Const, meta, - Some(Storage::CPU(cpu_buf)), + Some(Storage::CPU(CPUBuffer::from(raw_buf))), Device::CPU, ) } else { @@ -414,15 +417,12 @@ mod tests { #[test] fn test_matmul() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; - let a = Tensor::randn::(shape![512, 512], device.clone()); - let b = Tensor::randn::(shape![512, 512], device.clone()); + let a = Tensor::randn::(shape![1024, 1024], device.clone()); + let b = Tensor::randn::(shape![1024, 1024], device.clone()); let c = a.matmul(&b)?; c.resolve()?; - println!("\nA: {:#?}", a); - println!("\nB: {:#?}", b); - println!("\nC: {:#?}", c); let d = c.to(Device::CPU)?; - println!("\nD: {:#?}", d); + println!("{:?}", d); Ok(()) } @@ -436,36 +436,32 @@ mod tests { let prg = PyModule::from_code( py, r#" -import torch + import torch -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, "x.py", "x", )?; - let result = prg + let py_a = a.to_py::(&py); + let py_b = b.to_py::(&py); + + let py_c = prg .getattr("matmul")? - .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? + .call1((py_a, py_b))? 
.extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) + Ok(Tensor::from(py_c)) }); - println!("\nTORCH: {:#?}", ground); - - println!("\nA: {:#?}", a); - println!("\nB: {:#?}", b); - - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - println!("\nOURS: {:#?}", our_result); - + let device = Device::request_device(DeviceRequest::GPU)?; + let a_gpu = a.to(device.clone())?; + let b_gpu = b.to(device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d_gpu = c_gpu.to(Device::CPU)?; + println!("Ours: {:?}", d_gpu); + println!("Ground: {:?}", ground); Ok(()) } } From 52dbfc351ad1a0fd3af3698396453685401824e2 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:11:29 +0000 Subject: [PATCH 20/37] chore: try --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/tensor.rs | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8caa488c..8c2432b5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test -- --nocapture + run: cargo test dbg -- --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index dcadedfe..dacb43bb 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -415,17 +415,20 @@ mod tests { use super::*; #[test] - fn test_matmul() -> anyhow::Result<()> { + fn dbg() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; - let a = Tensor::randn::(shape![1024, 1024], device.clone()); - let b = Tensor::randn::(shape![1024, 1024], device.clone()); - let c = a.matmul(&b)?; - c.resolve()?; - let d = c.to(Device::CPU)?; - println!("{:?}", d); + for _ in 0..10 { + let a = Tensor::randn::(shape![128, 128], device.clone()); + let b = Tensor::randn::(shape![128, 128], device.clone()); + let c = a.matmul(&b)?; + c.resolve()?; + let d = c.to(Device::CPU)?; + println!("{:?}", d); + } Ok(()) } + /* #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -464,4 +467,5 @@ mod tests { println!("Ground: {:?}", ground); Ok(()) } + */ } From 1528bdcfa334811598c7591513f36200a7451cd4 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:16:58 +0000 Subject: [PATCH 21/37] chore: try --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/tensor.rs | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8c2432b5..b7356739 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test dbg -- --nocapture + run: cargo test tensor -- --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index dacb43bb..8e0467a1 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -416,13 +416,16 @@ mod tests { #[test] fn dbg() -> anyhow::Result<()> { - 
let device = Device::request_device(DeviceRequest::GPU)?; + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; for _ in 0..10 { - let a = Tensor::randn::(shape![128, 128], device.clone()); - let b = Tensor::randn::(shape![128, 128], device.clone()); - let c = a.matmul(&b)?; - c.resolve()?; - let d = c.to(Device::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let a_gpu = a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + let d = c_gpu.to(Device::CPU)?; println!("{:?}", d); } Ok(()) From 54b6525210157c07b847a0421fc3e5ae3244be31 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:19:10 +0000 Subject: [PATCH 22/37] chore: try --- crates/ratchet-core/src/tensor.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 8e0467a1..3cdf699d 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -419,12 +419,13 @@ mod tests { let cpu_device = Device::request_device(DeviceRequest::CPU)?; let gpu_device = Device::request_device(DeviceRequest::GPU)?; for _ in 0..10 { - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let a = Tensor::randn::(shape![128, 128], cpu_device.clone()); + let b = Tensor::randn::(shape![128, 128], cpu_device.clone()); let a_gpu = a.to(gpu_device.clone())?; let b_gpu = b.to(gpu_device.clone())?; let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; let d = c_gpu.to(Device::CPU)?; println!("{:?}", d); } From a7d41fc4c1b77fc2d6ecba2b3e58eaf8bc5fc952 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:23:16 +0000 Subject: [PATCH 23/37] chore: try --- crates/ratchet-core/src/tensor.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 3cdf699d..66001718 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -414,6 +414,7 @@ mod tests { use super::*; + /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -431,8 +432,8 @@ mod tests { } Ok(()) } + */ - /* #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -443,24 +444,28 @@ mod tests { let prg = PyModule::from_code( py, r#" - import torch +import torch - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() +def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() "#, "x.py", "x", )?; let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); let py_c = prg .getattr("matmul")? .call1((py_a, py_b))? 
.extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); Ok(Tensor::from(py_c)) }); + println!("Ground: {:?}", ground); let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; @@ -471,5 +476,4 @@ mod tests { println!("Ground: {:?}", ground); Ok(()) } - */ } From 2d7d467a717b4966c92737213ded6db056b5a5ea Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:29:25 +0000 Subject: [PATCH 24/37] chore: try --- crates/ratchet-core/src/tensor.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 66001718..8113d46a 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -414,7 +414,6 @@ mod tests { use super::*; - /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -432,7 +431,6 @@ mod tests { } Ok(()) } - */ #[test] fn test_pyo3() -> anyhow::Result<()> { From d0ebe04ba671c88fdfc41ff4661067a0a5c90ff7 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:36:09 +0000 Subject: [PATCH 25/37] chore: try --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/storage/cpu_buffer.rs | 5 ----- crates/ratchet-core/src/storage/gpu_buffer.rs | 1 + crates/ratchet-core/src/storage/mod.rs | 7 ------- crates/ratchet-core/src/tensor.rs | 15 --------------- 5 files changed, 2 insertions(+), 28 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b7356739..cf4b5480 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test tensor -- --nocapture + run: cargo test tensor -- --test-threads=1 --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index f1e409fd..11323795 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -91,11 +91,6 @@ impl CPUBuffer { raw.as_bytes_mut().copy_from_slice(bytes); Self::from(raw) } - - pub fn deep_clone(&self) -> Self { - let raw_clone = (*self.inner()).clone(); - Self::from(raw_clone) - } } impl From for CPUBuffer { diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index 1631d3c7..3592e004 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -64,6 +64,7 @@ impl GPUBuffer { self.inner.usage() } + #[allow(unused)] pub fn deep_clone(&self, device: &WgpuDevice) -> Self { let clone = device .get_or_create_buffer(&BufferDescriptor::new( diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index f6bcc0d3..fe051014 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -43,13 +43,6 @@ impl Storage { _ => unimplemented!(), } } - - pub fn deep_clone(&self, device: &Device) -> Result { - match self { - Storage::CPU(c) => Ok(Storage::CPU(c.deep_clone())), - _ => unimplemented!(), - } - } } pub trait DeviceStorage: std::fmt::Debug + Clone + 'static { diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 8113d46a..ee595e87 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ 
-354,21 +354,6 @@ impl Tensor { use numpy::PyArray; PyArray::from_owned_array(*py, self.clone().into_ndarray::()) } - - pub fn deep_clone(&self) -> Tensor { - let storage_clone = self - .storage() - .as_ref() - .unwrap() - .deep_clone(self.device()) - .unwrap(); - Tensor::new( - self.op().clone(), - self.view.clone(), - Some(storage_clone), - self.device.clone(), - ) - } } #[cfg(feature = "pyo3")] From bf5af2bfebd185078c372038a6e3415f22b3fbd3 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:44:40 +0000 Subject: [PATCH 26/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 11323795..dac52b4a 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -61,6 +61,7 @@ impl Clone for RawCPUBuffer { impl Drop for RawCPUBuffer { fn drop(&mut self) { if !self.0.is_null() && self.1.size() > 0 { + println!("DROPPING: {:p}", self.0); unsafe { std::alloc::dealloc(self.0, self.1) } } } From 71a43ac10e16d5d6682bedfacd397d26f3d642df Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 15:43:00 +0000 Subject: [PATCH 27/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 55 ++++++++++++++++--------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index ee595e87..c553dc53 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -423,32 +423,34 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - println!("Ground: {:?}", ground); + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 
+ .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + println!("Ground: {:?}", ground); + */ let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; @@ -456,7 +458,6 @@ def matmul(a, b): c_gpu.resolve()?; let d_gpu = c_gpu.to(Device::CPU)?; println!("Ours: {:?}", d_gpu); - println!("Ground: {:?}", ground); Ok(()) } } From 8f0dd2b65dec5e2447af195aa7739a94622b8397 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 15:57:55 +0000 Subject: [PATCH 28/37] chore: no idea --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/storage/cpu_buffer.rs | 6 +- crates/ratchet-core/src/storage/mod.rs | 7 ++ crates/ratchet-core/src/tensor.rs | 68 +++++++++++-------- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index cf4b5480..b7356739 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test tensor -- --test-threads=1 --nocapture + run: cargo test tensor -- --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index dac52b4a..9c870f7d 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -2,7 +2,7 @@ use bytemuck::NoUninit; use crate::{storage::DeviceStorage, Device, DeviceError, GPUBuffer, Shape, TensorDType}; -use std::{alloc::Layout, fmt::Debug, sync::Arc}; +use std::{alloc::Layout, fmt::Debug}; use crate::DType; @@ -92,6 +92,10 @@ impl CPUBuffer { raw.as_bytes_mut().copy_from_slice(bytes); Self::from(raw) } + + pub fn deep_clone(&self) -> Result { + Ok(Self::from(self.inner().clone())) + } } impl From for CPUBuffer { diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index fe051014..66652e85 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -43,6 +43,13 @@ impl Storage { _ => unimplemented!(), } } + + pub fn deep_clone(&self) -> Result { + match self { + Storage::CPU(c) => Ok(Storage::CPU(c.deep_clone()?)), + _ => todo!(), + } + } } pub trait DeviceStorage: std::fmt::Debug + Clone + 'static { diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index c553dc53..8b7382b1 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -352,7 +352,23 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(*py, self.clone().into_ndarray::()) + assert!( + self.device().is_cpu(), + "Cannot convert non-CPU tensor to numpy array" + ); + PyArray::from_owned_array(*py, self.deep_clone().into_ndarray::()) + } + + pub fn deep_clone(&self) -> Tensor { + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let cloned_storage = storage.deep_clone().unwrap(); + Tensor::new( + LazyOp::Const, + self.view.clone(), + Some(cloned_storage), + self.device.clone(), + ) } } @@ -423,34 +439,32 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - 
import torch + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" +import torch - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() +def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - println!("Ground: {:?}", ground); - */ + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? + .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + println!("Ground: {:?}", ground); let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; From d63d2b6b6b10b99ffabecb710446380e5d01f8c9 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:06:58 +0000 Subject: [PATCH 29/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 6 +- crates/ratchet-core/src/tensor.rs | 75 +++++++++---------- 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 9c870f7d..e6c26465 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -10,10 +10,6 @@ use crate::DType; pub struct RawCPUBuffer(*mut u8, Layout); impl RawCPUBuffer { - pub fn from_raw_parts(ptr: *mut u8, layout: Layout) -> Self { - Self(ptr, layout) - } - pub fn into_raw_parts(&self) -> (*mut u8, Layout) { (self.0, self.1) } @@ -39,6 +35,7 @@ impl RawCPUBuffer { assert!(!ptr.is_null()); ptr } as *mut u8; + println!("Unintialized: {:p}", data); Self(data, layout) } } @@ -53,6 +50,7 @@ impl Clone for RawCPUBuffer { assert!(!ptr.is_null()); ptr } as *mut u8; + println!("Cloning: {:p} -> {:p}", ptr, data); unsafe { ptr.copy_to_nonoverlapping(data, layout.size()) }; Self(data, layout) } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 8b7382b1..f62235c0 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -386,7 +386,7 @@ impl From> for Tensor { let vec = it.into_raw_vec().into_boxed_slice(); let ptr = Box::into_raw(vec) as *mut u8; - let raw_buf = RawCPUBuffer::from_raw_parts(ptr, layout); + let raw_buf = RawCPUBuffer::new(ptr, layout); let meta = StorageView::new(shape, T::dt(), strides); Tensor::new( LazyOp::Const, @@ -419,17 +419,15 @@ mod tests { fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; let gpu_device = Device::request_device(DeviceRequest::GPU)?; - for _ in 0..10 { - let a = Tensor::randn::(shape![128, 128], cpu_device.clone()); - let b = Tensor::randn::(shape![128, 128], cpu_device.clone()); - - let a_gpu = a.to(gpu_device.clone())?; - let b_gpu = b.to(gpu_device.clone())?; - let c_gpu = a_gpu.matmul(&b_gpu)?; - c_gpu.resolve()?; - let d = c_gpu.to(Device::CPU)?; - println!("{:?}", d); - } + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let a_gpu = 
a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d = c_gpu.to(Device::CPU)?; + println!("{:?}", d); Ok(()) } @@ -439,32 +437,33 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - println!("Ground: {:?}", ground); + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? + .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + */ let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; From 361091853ccfa27b8b9e5d18ea2b4b9693334540 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:08:52 +0000 Subject: [PATCH 30/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index e6c26465..3b170e6f 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -105,9 +105,8 @@ impl From for CPUBuffer { impl DeviceStorage for CPUBuffer { fn to_device(&self, device: &Device) -> Result { let gpu_device = device.try_gpu()?; - let raw = self.inner(); - let (ptr, layout) = raw.into_raw_parts(); - let bytes = unsafe { std::slice::from_raw_parts(ptr, layout.size()) }; + let bytes = self.inner().as_bytes(); + let layout = self.inner().1; Ok(GPUBuffer::from_bytes(bytes, layout.align(), gpu_device)) } From 4bdcadaf5aa885e3c6a31d12bc4874e1060739a9 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:14:41 +0000 Subject: [PATCH 31/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 3b170e6f..5281bd99 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -19,6 +19,7 @@ impl RawCPUBuffer { } pub fn as_bytes(&self) -> &[u8] { + println!("Reading: {:p}", self.0); unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } } From d19f5a45a13b0631dcc5e09523df296b79999b06 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:16:56 +0000 Subject: [PATCH 32/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 90 ++++++++++++++++--------------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs 
b/crates/ratchet-core/src/tensor.rs index f62235c0..0f6aaa4b 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -428,49 +428,55 @@ mod tests { c_gpu.resolve()?; let d = c_gpu.to(Device::CPU)?; println!("{:?}", d); + let a_cpu = a_gpu.to(Device::CPU)?; + println!("{:?}", a_cpu); + let b_cpu = b_gpu.to(Device::CPU)?; + println!("{:?}", b_cpu); Ok(()) } - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - */ - let device = Device::request_device(DeviceRequest::GPU)?; - let a_gpu = a.to(device.clone())?; - let b_gpu = b.to(device.clone())?; - let c_gpu = a_gpu.matmul(&b_gpu)?; - c_gpu.resolve()?; - let d_gpu = c_gpu.to(Device::CPU)?; - println!("Ours: {:?}", d_gpu); - Ok(()) - } + /* + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 
+ .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + */ + let device = Device::request_device(DeviceRequest::GPU)?; + let a_gpu = a.to(device.clone())?; + let b_gpu = b.to(device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d_gpu = c_gpu.to(Device::CPU)?; + println!("Ours: {:?}", d_gpu); + Ok(()) + } + */ } From aa651c2bfe71b96908fbd3fa79763ca251ad37ca Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:21:38 +0000 Subject: [PATCH 33/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 88 +++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 0f6aaa4b..06b72fd7 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -415,6 +415,7 @@ mod tests { use super::*; + /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -434,49 +435,48 @@ mod tests { println!("{:?}", b_cpu); Ok(()) } - - /* - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - */ - let device = Device::request_device(DeviceRequest::GPU)?; - let a_gpu = a.to(device.clone())?; - let b_gpu = b.to(device.clone())?; - let c_gpu = a_gpu.matmul(&b_gpu)?; - c_gpu.resolve()?; - let d_gpu = c_gpu.to(Device::CPU)?; - println!("Ours: {:?}", d_gpu); - Ok(()) - } */ + + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 
+ .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + */ + let device = Device::request_device(DeviceRequest::GPU)?; + let a_gpu = a.to(device.clone())?; + let b_gpu = b.to(device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d_gpu = c_gpu.to(Device::CPU)?; + println!("Ours: {:?}", d_gpu); + Ok(()) + } } From 4a1e0f0124861e6ccb29a08e1a5e97ff80f02735 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:26:00 +0000 Subject: [PATCH 34/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 108 ++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 35 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 06b72fd7..177e4ae5 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -435,48 +434,87 @@ mod tests { println!("{:?}", b_cpu); Ok(()) } - */ #[test] - fn test_pyo3() -> anyhow::Result<()> { + fn dbg2() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? 
- .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - */ - let device = Device::request_device(DeviceRequest::GPU)?; - let a_gpu = a.to(device.clone())?; - let b_gpu = b.to(device.clone())?; + let a_gpu = a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; let c_gpu = a_gpu.matmul(&b_gpu)?; c_gpu.resolve()?; - let d_gpu = c_gpu.to(Device::CPU)?; - println!("Ours: {:?}", d_gpu); + let d = c_gpu.to(Device::CPU)?; + println!("{:?}", d); + let a_cpu = a_gpu.to(Device::CPU)?; + println!("{:?}", a_cpu); + let b_cpu = b_gpu.to(Device::CPU)?; + println!("{:?}", b_cpu); Ok(()) } + + #[test] + fn dbg3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let a_gpu = a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d = c_gpu.to(Device::CPU)?; + println!("{:?}", d); + let a_cpu = a_gpu.to(Device::CPU)?; + println!("{:?}", a_cpu); + let b_cpu = b_gpu.to(Device::CPU)?; + println!("{:?}", b_cpu); + Ok(()) + } + + /* + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 

From e60da0137bef333a3aafac366c8e38c75cc2c3d3 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 16:30:31 +0000
Subject: [PATCH 35/37] chore: no idea

---
 crates/ratchet-core/src/tensor.rs | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 177e4ae5..7901dd52 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -415,6 +415,7 @@ mod tests {

     use super::*;

+    /*
     #[test]
     fn dbg() -> anyhow::Result<()> {
         let cpu_device = Device::request_device(DeviceRequest::CPU)?;
@@ -474,6 +475,42 @@ mod tests {
         println!("{:?}", b_cpu);
         Ok(())
     }
+    */
+
+    #[test]
+    fn dbg4() -> anyhow::Result<()> {
+        let cpu_device = Device::request_device(DeviceRequest::CPU)?;
+        let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+        let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+
+        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
+            let prg = PyModule::from_code(
+                py,
+                r#"
+            import torch
+
+            def matmul(a, b):
+                return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+            "#,
+                "x.py",
+                "x",
+            )?;
+
+            let py_a = a.to_py::<f32>(&py);
+            println!("py_a: {:?}", py_a);
+            let py_b = b.to_py::<f32>(&py);
+            println!("py_b: {:?}", py_b);
+
+            let py_c = prg
+                .getattr("matmul")?
+                .call1((py_a, py_b))?
+                .extract::<&PyArrayDyn<f32>>()?;
+            println!("py_c: {:?}", py_c);
+            Ok(Tensor::from(py_c))
+        });
+        println!("ground: {:?}", ground);
+        Ok(())
+    }

     /*
     #[test]
     fn test_pyo3() -> anyhow::Result<()> {

From f84feaf47d579a296fc1ec2e6196cbfe407dcafe Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 16:33:43 +0000
Subject: [PATCH 36/37] chore: dear god

---
 crates/ratchet-core/src/tensor.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 7901dd52..1e206876 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -487,10 +487,10 @@ mod tests {
             let prg = PyModule::from_code(
                 py,
                 r#"
-            import torch
+import torch

-            def matmul(a, b):
-                return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+def matmul(a, b):
+    return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
 "#,
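Patch 36 is the substantive fix in this run. `PyModule::from_code` hands the source string to the Python interpreter verbatim, and Python is whitespace-sensitive, so indenting the embedded module to match the surrounding Rust makes it fail with an IndentationError before `matmul` is ever defined; the module body has to start at column 0 inside the raw string. A self-contained demonstration of the failure mode (assuming pyo3 0.20 with `auto-initialize`; not code from the series):

use pyo3::{types::PyModule, Python};

fn main() {
    Python::with_gil(|py| {
        // Leading whitespace is significant: an indented module body is rejected.
        assert!(PyModule::from_code(py, "    import math", "x.py", "x").is_err());
        // The same source at column 0 loads fine.
        assert!(PyModule::from_code(py, "import math", "x.py", "x").is_ok());
    });
}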

From 53857f47f1f52b3f15d25a424a00c6bf9a3bb45f Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 16:38:40 +0000
Subject: [PATCH 37/37] chore: dear god

---
 crates/ratchet-core/src/tensor.rs | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 1e206876..d392db8e 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -483,7 +483,7 @@ mod tests {
         let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
         let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());

-        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
+        Python::with_gil(|py| {
             let prg = PyModule::from_code(
@@ -494,21 +494,15 @@ mod tests {
                 "x.py",
                 "x",
-            )?;
+            )
+            .unwrap();

             let py_a = a.to_py::<f32>(&py);
             println!("py_a: {:?}", py_a);
-            let py_b = b.to_py::<f32>(&py);
-            println!("py_b: {:?}", py_b);
-
-            let py_c = prg
-                .getattr("matmul")?
-                .call1((py_a, py_b))?
-                .extract::<&PyArrayDyn<f32>>()?;
-            println!("py_c: {:?}", py_c);
-            Ok(Tensor::from(py_c))
         });
-        println!("ground: {:?}", ground);
+
+        let device = Device::request_device(DeviceRequest::GPU)?;
+        let a_gpu = a.to(device.clone())?;
         Ok(())
     }
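The series ends mid-debug: after patch 37, `dbg4` only converts `a` to a NumPy array under the GIL and then moves it to the GPU, with the torch comparison stripped back out. Once the GPU product and the PyTorch ground truth both sit on the CPU again, the missing final step is an elementwise tolerance check along these lines; `all_close` is a hypothetical helper built on the series' `into_ndarray`, not an API from these patches:

fn all_close(ours: &Tensor, ground: &Tensor, atol: f32) -> bool {
    // Both tensors must already live on the CPU for into_ndarray to work.
    let (a, b) = (ours.into_ndarray::<f32>(), ground.into_ndarray::<f32>());
    a.shape() == b.shape() && a.iter().zip(b.iter()).all(|(x, y)| (x - y).abs() <= atol)
}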