From 1e3d725ecd6f06f873e1784d36e5b58654975d14 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 11:46:57 +0000
Subject: [PATCH 01/37] chore: first pass pytorch integration

---
 .gitignore                             |   1 +
 crates/ratchet-core/Cargo.toml         |  12 ++-
 crates/ratchet-core/src/storage/mod.rs |  10 +++
 crates/ratchet-core/src/tensor.rs      | 113 ++++++++++++++++++++++++-
 justfile                               |   6 ++
 5 files changed, 139 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6985cf1b..ce0eaee0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ Cargo.lock

 # MSVC Windows builds of rustc generate these, which store debugging information
 *.pdb
+.python-version

diff --git a/crates/ratchet-core/Cargo.toml b/crates/ratchet-core/Cargo.toml
index 97b1619b..f2099d76 100644
--- a/crates/ratchet-core/Cargo.toml
+++ b/crates/ratchet-core/Cargo.toml
@@ -4,7 +4,8 @@ version = "0.1.0"
 edition = "2021"

 [features]
-default = ["rand"]
+default = ["rand", "pyo3"]
+pyo3 = ["dep:pyo3", "dep:numpy", "dep:ndarray"]
 gpu_profiling = []
 rand = ["dep:rand", "dep:rand_distr"]

@@ -36,5 +37,14 @@ rand_distr = { version = "0.4.3", optional = true }
 rand = { version = "0.8.4", optional = true }
 lazy_static = "1.4.0"

+# Python bindings
+pyo3 = { version = "0.20.2", features=["auto-initialize"], optional = true }
+numpy = { version = "0.20.0", optional = true }
+ndarray = { version = "0.15.6", optional = true }
+
 [dev-dependencies]
 rand = "0.8.4"
+pyo3 = { version = "0.20.2", features=["auto-initialize"] }
+numpy = { version = "0.20.0" }
+ndarray = { version = "0.15.6" }
+

diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs
index 0620beff..f998dc16 100644
--- a/crates/ratchet-core/src/storage/mod.rs
+++ b/crates/ratchet-core/src/storage/mod.rs
@@ -52,6 +52,16 @@ impl Storage {
         }
     }

+    pub fn try_cpu(&self) -> Result<&RawCPUBuffer, DeviceError> {
+        match self.raw.as_ref() {
+            Some(RawStorage::CPU(raw)) => Ok(raw),
+            _ => Err(DeviceError::DeviceMismatch(
+                "CPU".to_string(),
+                "GPU".to_string(),
+            )),
+        }
+    }
+
     pub fn dump(&self, dtype: DType, full: bool) -> String {
         self.raw
             .as_ref()

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index ef72021d..da2e77ea 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -1,7 +1,7 @@
 use crate::gpu::{CpuUniform, WgpuDevice};
 use crate::{
     ops::*, CompiledOp, DType, Device, DeviceStorage, Executable, Operation, OperationError,
-    RawStorage, Shape, Storage, Strides, TensorDType, TensorId,
+    RawCPUBuffer, RawStorage, Shape, Storage, Strides, TensorDType, TensorId,
 };
 use crate::{BinaryOp, LazyOp};

@@ -12,6 +12,12 @@ use std::sync::Arc;
 #[cfg(feature = "rand")]
 use {rand::prelude::*, rand_distr::StandardNormal};

+#[cfg(feature = "pyo3")]
+use {
+    ndarray::{ArrayD, ArrayViewD},
+    numpy::PyArrayDyn,
+};
+
 // thiserror error for Tensor
 #[derive(thiserror::Error, Debug)]
 pub enum TensorError {
@@ -295,16 +301,87 @@ impl Tensor {
             _ => Ok(self.clone()),
         }
     }
+
+    #[cfg(feature = "pyo3")]
+    pub fn into_ndarray<T: TensorDType>(&self) -> ArrayD<T> {
+        assert!(self.device().is_cpu());
+        let storage = self.storage().try_read().unwrap();
+        let raw_cpu = storage.try_cpu().unwrap();
+        let shape = self.shape().to_vec();
+        if self.num_bytes() != 0 {
+            let ptr = raw_cpu.inner().0 as *const T;
+            unsafe { ArrayViewD::from_shape_ptr(shape, ptr).to_owned() }
+        } else {
+            ArrayViewD::from_shape(shape, &[]).unwrap().to_owned()
+        }
+    }
+
+    #[cfg(feature = "pyo3")]
+    pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>(
+        &'s self,
+        py: &'p pyo3::Python<'p>,
+    ) -> &PyArrayDyn<T> {
+        use numpy::PyArray;
+        PyArray::from_owned_array(*py, self.clone().into_ndarray::<T>())
+    }
 }
+
+#[cfg(feature = "pyo3")]
+impl<T: TensorDType> From<ArrayD<T>> for Tensor {
+    fn from(it: ArrayD<T>) -> Self {
+        if it.as_slice().is_some() {
+            let layout = std::alloc::Layout::from_size_align(
+                it.len() * std::mem::size_of::<T>(),
+                std::mem::align_of::<T>(),
+            )
+            .unwrap();
+            let shape = it.shape().to_vec().into();
+            let strides = Strides::from(&shape);
+            let vec = it.into_raw_vec().into_boxed_slice();
+            let ptr = Box::into_raw(vec) as *mut u8;
+
+            let raw_buf = RawCPUBuffer::new(ptr, layout);
+            let storage = Storage::from(RawStorage::CPU(raw_buf));
+            let meta = StorageView::new(shape, T::dt(), strides);
+            Tensor::new(LazyOp::Const, meta, storage, Device::CPU)
+        } else {
+            panic!("Cannot convert numpy array with non-contiguous memory layout to tensor");
+        }
+    }
+}
+
+#[cfg(feature = "pyo3")]
+impl<T: TensorDType + numpy::Element> From<&PyArrayDyn<T>> for Tensor {
+    fn from(array: &PyArrayDyn<T>) -> Self {
+        Self::from(array.to_owned_array())
+    }
+}

 #[cfg(test)]
 mod tests {
+    use pyo3::{types::PyModule, Python};
+
     use crate::{shape, DeviceRequest};

     use super::*;

     #[test]
-    fn test_cfg() -> anyhow::Result<()> {
+    fn test_matmul() -> anyhow::Result<()> {
+        let device = Device::request_device(DeviceRequest::GPU)?;
+        let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
+        let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
+        let c = a.matmul(&b)?;
+        c.resolve()?;
+        println!("\nA: {:#?}", a);
+        println!("\nB: {:#?}", b);
+        println!("\nC: {:#?}", c);
+        let d = c.to(Device::CPU)?;
+        println!("\nD: {:#?}", d);
+        Ok(())
+    }
+
+    #[test]
+    fn test_pyo3() -> anyhow::Result<()> {
         let device = Device::request_device(DeviceRequest::GPU)?;
         let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
         let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
@@ -315,6 +392,38 @@ mod tests {
         println!("\nC: {:#?}", c);
         let d = c.to(Device::CPU)?;
         println!("\nD: {:#?}", d);
+
+        let a = a.to(Device::CPU)?;
+        let b = b.to(Device::CPU)?;
+        let c = Python::with_gil(|py| {
+            let npy_a = a.to_py::<f32>(&py);
+            let npy_b = b.to_py::<f32>(&py);
+
+            let activators = PyModule::from_code(
+                py,
+                r#"
+import numpy as np
+import torch
+
+def matmul(a, b):
+    return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+"#,
+                "x.py",
+                "x",
+            )
+            .unwrap();
+
+            let result = activators
+                .getattr("matmul")
+                .unwrap()
+                .call1((npy_a, npy_b))
+                .unwrap()
+                .extract::<&PyArrayDyn<f32>>()
+                .unwrap();
+            Tensor::from(result)
+        });
+        println!("\nC: {:#?}", c);
+
         Ok(())
     }
 }

diff --git a/justfile b/justfile
index 99357e51..107577c3 100644
--- a/justfile
+++ b/justfile
@@ -1,2 +1,8 @@
 line-count:
     cd ./crates/ratchet-core && scc -irs --exclude-file kernels
+install-pyo3:
+    env PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install --verbose 3.10.6
+    echo "Please add PYO3_PYTHON to your .bashrc or .zshrc"
+wasm CRATE:
+    RUSTFLAGS=--cfg=web_sys_unstable_apis wasm-pack build --target web -d `pwd`/target/pkg/{{CRATE}} --out-name {{CRATE}} ./crates/{{CRATE}} --release
+
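The `From<ArrayD<T>>` conversion in this patch is the crux of the series: it steals ndarray's heap allocation (`into_raw_vec().into_boxed_slice()` then `Box::into_raw`), and `RawCPUBuffer` later frees it with `std::alloc::dealloc`. That is only sound if the `Layout` handed to `dealloc` matches the original allocation, which `into_boxed_slice` guarantees by shrinking capacity to length. A standalone sketch of the invariant (illustrative, not part of the patch):

    fn steal_and_free(v: Vec<f32>) {
        if v.is_empty() {
            return; // an empty Vec may never have allocated
        }
        let len = v.len();
        let boxed = v.into_boxed_slice(); // shrinks capacity to exactly len
        let ptr = Box::into_raw(boxed) as *mut u8;
        // This Layout matches the Vec's allocation only because capacity == len.
        let layout = std::alloc::Layout::array::<f32>(len).unwrap();
        // Exactly one owner may run this; a second free of `ptr` is the
        // double free the later commits in this series hunt down.
        unsafe { std::alloc::dealloc(ptr, layout) };
    }
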
From 0afb27d46ffe2ddb2521ea8c7fc69a5f1dada6b2 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 11:51:15 +0000
Subject: [PATCH 02/37] chore: cleaning

---
 crates/ratchet-core/src/tensor.rs | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index da2e77ea..97f57639 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -395,14 +395,10 @@ mod tests {

         let a = a.to(Device::CPU)?;
         let b = b.to(Device::CPU)?;
-        let c = Python::with_gil(|py| {
-            let npy_a = a.to_py::<f32>(&py);
-            let npy_b = b.to_py::<f32>(&py);
-
-            let activators = PyModule::from_code(
+        let c: anyhow::Result<Tensor> = Python::with_gil(|py| {
+            let prg = PyModule::from_code(
                 py,
                 r#"
-import numpy as np
 import torch

 def matmul(a, b):
@@ -410,19 +406,15 @@ def matmul(a, b):
 "#,
                 "x.py",
                 "x",
-            )
-            .unwrap();
+            )?;

-            let result = activators
-                .getattr("matmul")
-                .unwrap()
-                .call1((npy_a, npy_b))
-                .unwrap()
-                .extract::<&PyArrayDyn<f32>>()
-                .unwrap();
-            Tensor::from(result)
+            let result = prg
+                .getattr("matmul")?
+                .call1((a.to_py::<f32>(&py), b.to_py::<f32>(&py)))?
+                .extract::<&PyArrayDyn<f32>>()?;
+            Ok(Tensor::from(result))
         });
-        println!("\nC: {:#?}", c);
+        println!("\nTORCH: {:#?}", c);

         Ok(())
     }
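This cleanup leans on pyo3's error conversion: `PyErr` implements `std::error::Error`, so inside a closure returning `anyhow::Result<Tensor>` every `unwrap()` can become `?`. The pattern in miniature (a sketch, not from the patch):

    fn eval_x(py: pyo3::Python<'_>) -> anyhow::Result<i32> {
        // PyErr converts into anyhow::Error automatically, so `?` works here.
        let m = pyo3::types::PyModule::from_code(py, "x = 1 + 1", "x.py", "x")?;
        Ok(m.getattr("x")?.extract::<i32>()?)
    }
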
From 8c2b54d6ab89f92eec495d79f994427722a37a32 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 11:54:14 +0000
Subject: [PATCH 03/37] chore: add python to CI for tests

---
 .github/workflows/rust.yml | 6 ++++++
 requirements.txt           | 3 +++
 2 files changed, 9 insertions(+)
 create mode 100644 requirements.txt

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index ab38dbfd..cdaea5e8 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -33,6 +33,12 @@ jobs:
           sudo apt-get update
           sudo apt install -y libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev vulkan-sdk mesa-vulkan-drivers pkg-config libasound2-dev

+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10.6'
+          cache: 'pip'
+      - run: pip install -r requirements.txt
       - name: Setup
         run: |
           cargo install wasm-pack

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..cbd0e90e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+--extra-index-url https://download.pytorch.org/whl/cpu
+numpy==1.24.3
+torch==2.0.1

From 78e3289f61c2c29a9b7fd1538fe7b8eb0f6e67b9 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 12:18:15 +0000
Subject: [PATCH 04/37] chore: sigsegv

---
 .github/workflows/rust.yml       | 1 +
 Cargo.toml                       | 3 ---
 crates/ratchet-loader/Cargo.toml | 4 ++--
 crates/ratchet-models/Cargo.toml | 2 +-
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index cdaea5e8..7c62f447 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -10,6 +10,7 @@ env:
   CARGO_TERM_COLOR: always
   WGPU_DX12_COMPILER: dxc
   RUSTFLAGS: --cfg=web_sys_unstable_apis
+  RUST_BACKTRACE: 1

 jobs:
   build:

diff --git a/Cargo.toml b/Cargo.toml
index 8d0becb9..20543cea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,6 +28,3 @@ derive-new = "0.6.0"
 log = "0.4.20"
 thiserror = "1.0.56"
 byteorder = "1.5.0"
-
-[workspace.dev-dependencies]
-hf-hub = "0.3.0"

diff --git a/crates/ratchet-loader/Cargo.toml b/crates/ratchet-loader/Cargo.toml
index 3df25b7c..48be9dee 100644
--- a/crates/ratchet-loader/Cargo.toml
+++ b/crates/ratchet-loader/Cargo.toml
@@ -6,8 +6,8 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
-half.workspace = true
 ratchet = { path = "../ratchet-core" }
+half.workspace = true
 byteorder.workspace = true
 anyhow.workspace = true
 bytemuck.workspace = true
@@ -16,4 +16,4 @@ derive-new.workspace = true
 log.workspace = true

 [dev-dependencies]
-hf-hub = "0.3.2"
+hf-hub = "0.3.2"

diff --git a/crates/ratchet-models/Cargo.toml b/crates/ratchet-models/Cargo.toml
index e88ae67d..c1e293d8 100644
---
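The crash the next few commits chase is a SIGSEGV that turns out to be a double free: two owners of one CPU allocation. A deliberately broken, hypothetical reproduction of that failure mode (not from the patch); running it under `cargo +nightly miri test` pinpoints the bug without involving the GPU path at all:

    #[test]
    fn double_free_repro() {
        // DELIBERATELY broken: two Boxes are reconstructed from the same raw
        // pointer, so the allocation is freed twice when both drop.
        let ptr = Box::into_raw(vec![1u8, 2, 3].into_boxed_slice());
        let first = unsafe { Box::from_raw(ptr) };
        let second = unsafe { Box::from_raw(ptr) };
        drop(first);
        drop(second); // Miri reports the double free here
    }
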
a/crates/ratchet-models/Cargo.toml
+++ b/crates/ratchet-models/Cargo.toml
@@ -15,5 +15,5 @@ derive-new.workspace = true
 log.workspace = true

 [dev-dependencies]
-hf-hub = { version = "0.3.0" }
+hf-hub = "0.3.2"


From 77d7d390a051b29a39c4f61d88270429d8636319 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 12:29:58 +0000
Subject: [PATCH 05/37] chore: sigsegv

---
 crates/ratchet-core/src/tensor.rs | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 97f57639..89bbbbd5 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -382,19 +382,9 @@ mod tests {

     #[test]
     fn test_pyo3() -> anyhow::Result<()> {
-        let device = Device::request_device(DeviceRequest::GPU)?;
+        let device = Device::request_device(DeviceRequest::CPU)?;
         let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
         let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
-        let c = a.matmul(&b)?;
-        c.resolve()?;
-        println!("\nA: {:#?}", a);
-        println!("\nB: {:#?}", b);
-        println!("\nC: {:#?}", c);
-        let d = c.to(Device::CPU)?;
-        println!("\nD: {:#?}", d);
-
-        let a = a.to(Device::CPU)?;
-        let b = b.to(Device::CPU)?;
         let c: anyhow::Result<Tensor> = Python::with_gil(|py| {
             let prg = PyModule::from_code(
                 py,

From ca8b76a7de613e7a13c0c072a8b8ed6ff88ee33f Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 12:45:25 +0000
Subject: [PATCH 06/37] chore: double free

---
 crates/ratchet-core/src/device.rs             | 2 ++
 crates/ratchet-core/src/storage/gpu_buffer.rs | 9 ++-------
 crates/ratchet-core/src/tensor.rs             | 5 +++--
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/crates/ratchet-core/src/device.rs b/crates/ratchet-core/src/device.rs
index 368db437..d30e8da1 100644
--- a/crates/ratchet-core/src/device.rs
+++ b/crates/ratchet-core/src/device.rs
@@ -14,6 +14,8 @@ pub enum DeviceError {
     BufferAllocationFailed(#[from] AllocatorError),
     #[error("Invalid GPU Buffer Usage, current: {0:?}, required: {1:?}")]
     InvalidBufferUsage(wgpu::BufferUsages, wgpu::BufferUsages),
+    #[error("Failed to transfer buffer with error: {0:?}")]
+    BufferTransferFailed(#[from] wgpu::BufferAsyncError),
 }

 pub enum DeviceRequest {

diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs
index e7aa856a..a5baedc7 100644
--- a/crates/ratchet-core/src/storage/gpu_buffer.rs
+++ b/crates/ratchet-core/src/storage/gpu_buffer.rs
@@ -91,7 +91,7 @@ impl DeviceStorage for RawGPUBuffer {
         self.validate_usages(BufferUsages::COPY_SRC)?;
         let device = device.try_gpu()?;
         let buffer_slice = self.inner.slice(..);
-        let (tx, rx) = futures_intrusive::channel::shared::oneshot_channel();
+        let (tx, rx) = std::sync::mpsc::channel();

         let alignment = self.alignment;
         wgpu::util::DownloadBuffer::read_buffer(
@@ -107,12 +107,7 @@
             },
         );
         device.poll(wgpu::Maintain::Wait);
-        //TODO: fix unwrap
-        let storage = pollster::block_on(async { rx.receive().await })
-            .ok_or(TensorError::TransferError)
-            .unwrap()
-            .map_err(|_| TensorError::TransferError)
-            .unwrap();
+        let storage = rx.recv().unwrap()?;
         Ok(storage)
     }

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 89bbbbd5..683e5dce 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -246,8 +246,8 @@ impl Tensor {

     pub fn compile(&self, uniform: &mut CpuUniform, device: &WgpuDevice) -> Option<CompiledOp> {
         match self.op() {
-            LazyOp::Binary(b) => Some(b.compile(self, uniform, device).unwrap()),
-            LazyOp::Matmul(m) => Some(m.compile(self, uniform, device).unwrap()),
+            LazyOp::Binary(b) => b.compile(self, uniform, device).ok(),
+            LazyOp::Matmul(m) => m.compile(self, uniform, device).ok(),
             LazyOp::Const => None,
             _ => unimplemented!(),
         }
@@ -338,6 +338,7 @@ impl<T: TensorDType> From<ArrayD<T>> for Tensor {
         let shape = it.shape().to_vec().into();
         let strides = Strides::from(&shape);
         let vec = it.into_raw_vec().into_boxed_slice();
+        //This is causing a double free
        let ptr = Box::into_raw(vec) as *mut u8;

         let raw_buf = RawCPUBuffer::new(ptr, layout);
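Patch 06 also swaps the `futures_intrusive` oneshot plus `pollster` readback for a plain `std::sync::mpsc` channel. Blocking on the channel is safe because `device.poll(wgpu::Maintain::Wait)` does not return until the map callback has fired, so the send has already happened by the time `recv()` runs. The core of the pattern (a sketch using the names from the hunk above, with `device`, `buffer_slice` and `alignment` assumed in scope):

    let (tx, rx) = std::sync::mpsc::channel();
    wgpu::util::DownloadBuffer::read_buffer(device, device.queue(), &buffer_slice, move |result| {
        // Fires on this thread, inside the poll() below.
        tx.send(result.map(|db| RawCPUBuffer::from_bytes(&db, alignment)))
            .expect("receiver alive");
    });
    device.poll(wgpu::Maintain::Wait); // blocks until mapping callbacks have run
    let storage = rx.recv().unwrap()?; // cannot hang: the send already happened
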
From 6a98d051f5284b2d2de6a4ebfaa80401070676e1 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Sun, 21 Jan 2024 16:38:58 +0000
Subject: [PATCH 07/37] chore: double free

---
 ARCHITECTURE.md                                 |  13 +-
 crates/ratchet-core/src/compiled_op.rs          |   5 +-
 .../ratchet-core/src/gpu/buffer_allocator.rs    |  25 ++-
 crates/ratchet-core/src/gpu/device.rs           |  12 +-
 .../src/gpu/pools/bind_group_pool.rs            |   4 +-
 .../ratchet-core/src/gpu/pools/buffer_pool.rs   |  12 +-
 crates/ratchet-core/src/gpu/uniform.rs          |   4 +-
 crates/ratchet-core/src/lib.rs                  |   1 +
 crates/ratchet-core/src/storage/cpu_buffer.rs   | 116 ++++++++-----
 crates/ratchet-core/src/storage/gpu_buffer.rs   |  47 ++----
 crates/ratchet-core/src/storage/mod.rs          | 106 +++---------
 crates/ratchet-core/src/tensor.rs               | 152 ++++++++++++------
 12 files changed, 257 insertions(+), 240 deletions(-)

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 8196351c..99b36fab 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -10,17 +10,16 @@ Ratchet is designed for 1 thing only: **Inference on WebGPU**.
 This leads us to a few design decisions:
 1. Ratchet is **lazy**, no computation is done until the entire computation graph is built and executed. This aligns closely with CUDAGraphs & Command buffers.
-2. Ratchet supports **BOTH** static & dynamic graphs, this is key.
-    - The graph is implicitly defined through tensor operations. If any of the tensors are defined with a *symbolic dimension* (i.e a dimension not known until runtime, e.g sequence_len), the graph is dynamic. When the graph is dynamic, the graph is recompiled on inference pass (because runtime information is required).
-    - If no tensors contain a symbolic dimension, the graph is static. This means the graph is compiled into a single command buffer, and is repeatedly called with different input data (brrr).
-    - By exposing symbolic dimensions to the user, they can code their models with the CG in mind.
+2. Ratchet supports **BOTH** static & dynamic graphs, see [Unified Graph Execution by Jittor](http://scis.scichina.com/en/2020/222103.pdf) for more details.
 3. Memory planning is crucial. Creation and first bind of a buffer is *expensive* in WebGPU. Therefore, Ratchet uses a greedy algorithm to pool buffers for intermediate results of the CFG.

-Why do this?
-
 Take for example Whisper from OpenAI. This is an encoder-decoder model, where the encoder is completely static (i.e everything is known at compile time), and the decoder is very dynamic (KV caching, seq_len increments every step). By allowing both paradigms, we can maximise performance.

+## Memory Management
+
+Ratchet's top-level `Tensor` is just an `Arc` around the `Inner`. Tensors should be cheaply cloneable.
+`Inner` contains a `Storage`; this is an enum over our 2 managed structures for CPU & GPU: `CpuStorage` & `GpuStorage`.
+`CpuStorage` is an `Arc<RawCPUBuffer>`, and `GpuStorage` is an `Arc<RawGPUBuffer>`.
+
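+In code, the layering looks roughly like this (a sketch; field names abridged from `tensor.rs` and `storage/mod.rs` in this patch):
+
+```rust
+struct Tensor {
+    inner: Arc<Inner>, // cloning a Tensor is a refcount bump
+}
+struct Inner {
+    // id, op, view, device, ...
+    storage: Arc<RwLock<Option<Storage>>>, // None until the graph is resolved
+}
+enum Storage {
+    CPU(CPUBuffer), // wraps an Arc around a (ptr, Layout) CPU allocation
+    GPU(GPUBuffer), // wraps a pooled wgpu::Buffer
+}
+```
+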
## Quantization diff --git a/crates/ratchet-core/src/compiled_op.rs b/crates/ratchet-core/src/compiled_op.rs index 32226d56..1fc2904c 100644 --- a/crates/ratchet-core/src/compiled_op.rs +++ b/crates/ratchet-core/src/compiled_op.rs @@ -29,8 +29,9 @@ impl CompiledOp { let mut bind_group_entries = drvec![]; for tensor in srcs.iter().chain(std::iter::once(&dst)) { - let buf = tensor.storage().try_read().unwrap(); - let gpu_buf = &buf.try_gpu().unwrap().inner; + let storage_guard = tensor.storage(); + let storage = storage_guard.as_ref().unwrap(); + let gpu_buf = &storage.try_gpu().unwrap().inner; bind_group_entries.push(BindGroupEntry { handle: gpu_buf.handle, offset: 0, diff --git a/crates/ratchet-core/src/gpu/buffer_allocator.rs b/crates/ratchet-core/src/gpu/buffer_allocator.rs index 3988bff7..e6017fb2 100644 --- a/crates/ratchet-core/src/gpu/buffer_allocator.rs +++ b/crates/ratchet-core/src/gpu/buffer_allocator.rs @@ -2,7 +2,7 @@ use rustc_hash::FxHashMap; use wgpu::BufferUsages; use crate::{ - gpu::{BufferDescriptor, BufferPool, GPUBuffer, GpuBufferHandle}, + gpu::{BufferDescriptor, BufferPool, GpuBufferHandle, PooledGPUBuffer}, DeviceError, Tensor, TensorId, }; use std::cell::{Ref, RefCell, RefMut}; @@ -31,7 +31,7 @@ impl BufferAllocator { self.pool.borrow_mut().begin_pass(pass_index); } - pub fn get(&self, handle: GpuBufferHandle) -> GPUBuffer { + pub fn get(&self, handle: GpuBufferHandle) -> PooledGPUBuffer { self.pool.borrow().get(handle).unwrap() } @@ -43,7 +43,7 @@ impl BufferAllocator { self.pool.borrow_mut() } - pub fn create_buffer(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> GPUBuffer { + pub fn create_buffer(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> PooledGPUBuffer { self.pool.borrow_mut().get_or_create(desc, device) } @@ -52,13 +52,13 @@ impl BufferAllocator { desc: &BufferDescriptor, contents: &[u8], device: &WgpuDevice, - ) -> GPUBuffer { + ) -> PooledGPUBuffer { let buf = self.pool.borrow_mut().get_or_create(desc, device); device.queue().write_buffer(&buf.inner, 0, contents); buf } - pub fn create_uniform_init(&self, uniform: CpuUniform, device: &WgpuDevice) -> GPUBuffer { + pub fn create_uniform_init(&self, uniform: CpuUniform, device: &WgpuDevice) -> PooledGPUBuffer { let mut uniform = uniform.into_inner(); uniform.resize( uniform.len() + UNIFORM_ALIGN - uniform.len() % UNIFORM_ALIGN, @@ -85,9 +85,9 @@ impl BufferAllocator { fn graph_allocate( &self, descriptor: BufferDescriptor, - free: &mut Vec, + free: &mut Vec, device: &WgpuDevice, - ) -> GPUBuffer { + ) -> PooledGPUBuffer { let required_size = descriptor.size as _; let mut closest_index = None; let mut closest_size_diff: Option = None; @@ -121,17 +121,16 @@ impl BufferAllocator { &self, execution_order: &[Tensor], device: &WgpuDevice, - ) -> Result, DeviceError> { + ) -> Result, DeviceError> { let mut free = Vec::new(); //TODO: switch to BTreeMap let mut assignments = FxHashMap::default(); for t in execution_order { if t.resolved() { - let storage_resource = t - .storage() - .try_read() - .ok_or(AllocatorError::BufferNotFound)?; - assignments.insert(t.id(), storage_resource.try_gpu()?.inner.clone()); + assignments.insert( + t.id(), + t.storage().as_ref().unwrap().try_gpu()?.inner.clone(), + ); continue; } diff --git a/crates/ratchet-core/src/gpu/device.rs b/crates/ratchet-core/src/gpu/device.rs index fefe081e..a5fa847c 100644 --- a/crates/ratchet-core/src/gpu/device.rs +++ b/crates/ratchet-core/src/gpu/device.rs @@ -5,7 +5,7 @@ use wgpu::{Adapter, DeviceType, Limits}; use crate::DeviceError; 
-use super::{BufferDescriptor, GPUBuffer, PoolError}; +use super::{BufferDescriptor, PoolError, PooledGPUBuffer}; pub const MAX_BUFFER_SIZE: u64 = (2 << 29) - 1; @@ -151,21 +151,21 @@ impl WgpuDevice { &self, desc: &BufferDescriptor, contents: &[u8], - ) -> Result { + ) -> Result { Ok(self .buffer_allocator .create_buffer_init(desc, contents, self)) } - pub fn create_uniform_init(&self, cpu_uniform: CpuUniform) -> GPUBuffer { + pub fn create_uniform_init(&self, cpu_uniform: CpuUniform) -> PooledGPUBuffer { self.buffer_allocator.create_uniform_init(cpu_uniform, self) } - pub fn allocate_buffer(&self, desc: &BufferDescriptor) -> Result { + pub fn allocate_buffer(&self, desc: &BufferDescriptor) -> Result { Ok(self.buffer_allocator.create_buffer(desc, self)) } - pub fn get_buffer(&self, handle: GpuBufferHandle) -> Result { + pub fn get_buffer(&self, handle: GpuBufferHandle) -> Result { Ok(self.buffer_allocator.get(handle)) } @@ -221,7 +221,7 @@ impl WgpuDevice { &self, execution_order: &[Tensor], device: &WgpuDevice, - ) -> Result, DeviceError> { + ) -> Result, DeviceError> { self.buffer_allocator.allocate_cfg(execution_order, device) } } diff --git a/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs b/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs index 94b1fa81..4b97e3e2 100644 --- a/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs +++ b/crates/ratchet-core/src/gpu/pools/bind_group_pool.rs @@ -11,7 +11,7 @@ slotmap::new_key_type! { pub struct GpuBindGroupHandle; } #[derive(Clone)] pub struct GpuBindGroup { resource: Arc>, - _owned_buffers: RVec, + _owned_buffers: RVec, } impl std::fmt::Debug for GpuBindGroup { @@ -98,7 +98,7 @@ impl BindGroupPool { pub fn get_or_create(&self, desc: &BindGroupDescriptor, device: &WgpuDevice) -> GpuBindGroup { // Retrieve strong handles to buffers and textures. // This way, an owner of a bind group handle keeps buffers & textures alive!. - let owned_buffers: RVec = { + let owned_buffers: RVec = { desc.entries .iter() .map(|e| device.get_buffer(e.handle).unwrap()) diff --git a/crates/ratchet-core/src/gpu/pools/buffer_pool.rs b/crates/ratchet-core/src/gpu/pools/buffer_pool.rs index 4ce897d2..926df821 100644 --- a/crates/ratchet-core/src/gpu/pools/buffer_pool.rs +++ b/crates/ratchet-core/src/gpu/pools/buffer_pool.rs @@ -1,6 +1,6 @@ // Adapted from https://github.com/rerun-io/rerun MIT licensed use super::{DynamicResource, DynamicResourcePool, DynamicResourcesDesc, PoolError}; -use crate::gpu::WgpuDevice; +use crate::{gpu::WgpuDevice, RawGPUBuffer}; #[derive(Clone, Hash, PartialEq, Eq, Debug, derive_new::new)] pub struct BufferDescriptor { @@ -19,8 +19,8 @@ slotmap::new_key_type! { pub struct GpuBufferHandle; } /// A reference-counter baked buffer. /// Once all instances are dropped, the buffer will be marked for reclamation in the following pass. 
-pub type GPUBuffer = - std::sync::Arc>; +pub type PooledGPUBuffer = + std::sync::Arc>; impl DynamicResourcesDesc for BufferDescriptor { fn resource_size_in_bytes(&self) -> u64 { @@ -37,7 +37,7 @@ impl DynamicResourcesDesc for BufferDescriptor { } pub struct BufferPool { - inner: DynamicResourcePool, + inner: DynamicResourcePool, } impl BufferPool { @@ -47,7 +47,7 @@ impl BufferPool { } } - pub fn get_or_create(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> GPUBuffer { + pub fn get_or_create(&self, desc: &BufferDescriptor, device: &WgpuDevice) -> PooledGPUBuffer { self.inner.get_or_create(desc, |desc| { let (size, usage, mapped_at_creation) = desc.fields(); device.create_buffer(&wgpu::BufferDescriptor { @@ -64,7 +64,7 @@ impl BufferPool { } /// Method to retrieve a resource from a weak handle (used by [`super::GpuBindGroupPool`]) - pub fn get(&self, handle: GpuBufferHandle) -> Result { + pub fn get(&self, handle: GpuBufferHandle) -> Result { self.inner.get_from_handle(handle) } diff --git a/crates/ratchet-core/src/gpu/uniform.rs b/crates/ratchet-core/src/gpu/uniform.rs index c2e733a0..dbe4b2c5 100644 --- a/crates/ratchet-core/src/gpu/uniform.rs +++ b/crates/ratchet-core/src/gpu/uniform.rs @@ -5,7 +5,7 @@ use crate::{ rvec, }; -use super::{BindGroupDescriptor, GPUBuffer, GpuBindGroup, WgpuDevice}; +use super::{BindGroupDescriptor, GpuBindGroup, PooledGPUBuffer, WgpuDevice}; use encase::DynamicUniformBuffer; ///We use a single uniform buffer for all operations to hold their parameters. @@ -56,7 +56,7 @@ impl CpuUniform { } pub struct GpuUniform { - buf: GPUBuffer, + buf: PooledGPUBuffer, bind_group: GpuBindGroup, } diff --git a/crates/ratchet-core/src/lib.rs b/crates/ratchet-core/src/lib.rs index 6eecbfde..4ed8252c 100644 --- a/crates/ratchet-core/src/lib.rs +++ b/crates/ratchet-core/src/lib.rs @@ -33,6 +33,7 @@ pub use tensor_id::*; use smallvec::SmallVec; pub type RVec = SmallVec<[T; 4]>; pub type DRVec = SmallVec<[T; 8]>; //Double RVec +pub type RawGPUBuffer = wgpu::Buffer; //https://github.com/sonos/tract/blob/main/data/src/macros.rs#L2 #[macro_export] diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index b63147a7..1a4b3d21 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -1,26 +1,66 @@ use bytemuck::NoUninit; -use crate::{ - storage::{DeviceStorage, RawGPUBuffer}, - Device, DeviceError, Shape, TensorDType, -}; +use crate::{storage::DeviceStorage, Device, DeviceError, GPUBuffer, Shape, TensorDType}; -use std::{alloc::Layout, fmt::Debug}; +use std::{alloc::Layout, fmt::Debug, sync::Arc}; use crate::DType; #[derive(derive_new::new, Debug, PartialEq, Eq)] pub struct RawCPUBuffer(*mut u8, Layout); -unsafe impl Send for RawCPUBuffer {} - impl RawCPUBuffer { + pub fn into_raw_parts(&self) -> (*mut u8, Layout) { + (self.0, self.1) + } + + pub fn n_bytes(&self) -> usize { + self.1.size() + } + + pub fn as_bytes(&self) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } + } +} + +impl Clone for RawCPUBuffer { + fn clone(&self) -> Self { + let (ptr, layout) = self.into_raw_parts(); + let alloc = unsafe { std::alloc::alloc(layout) }; + unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; + + Self(alloc, layout) + } +} + +impl Drop for RawCPUBuffer { + fn drop(&mut self) { + if !self.0.is_null() && self.1.size() > 0 { + unsafe { std::alloc::dealloc(self.0, self.1) } + } + } +} + +/// Managed CPU buffer +#[derive(Debug, Clone, 
derive_new::new)] +pub struct CPUBuffer { + inner: Arc, +} + +unsafe impl Send for CPUBuffer {} +unsafe impl Sync for CPUBuffer {} + +impl CPUBuffer { pub fn from_slice(data: &[T], shape: &Shape) -> Self { assert_eq!(data.len(), shape.numel()); let bytes: &[u8] = bytemuck::cast_slice(data); Self::from_bytes(bytes, std::mem::align_of::()) } + pub fn inner(&self) -> &Arc { + &self.inner + } + unsafe fn uninitialized(size: usize, alignment: usize) -> Self { let layout = std::alloc::Layout::from_size_align(size, alignment).unwrap(); let data = if size == 0 { @@ -30,62 +70,56 @@ impl RawCPUBuffer { assert!(!ptr.is_null()); ptr } as *mut u8; - Self(data, layout) + Self::from_raw_parts(data, layout) } - pub fn inner(&self) -> (*mut u8, Layout) { - (self.0, self.1) - } - - pub fn as_bytes_mut(&mut self) -> &mut [u8] { - unsafe { std::slice::from_raw_parts_mut(self.0, self.1.size()) } - } - - pub fn as_bytes(&self) -> &[u8] { - unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } + pub fn from_raw_parts(data: *mut u8, layout: Layout) -> Self { + Self { + inner: Arc::new(RawCPUBuffer(data, layout)), + } } pub fn from_bytes(bytes: &[u8], alignment: usize) -> Self { - let mut storage = unsafe { Self::uninitialized(bytes.len(), alignment) }; - storage.as_bytes_mut().copy_from_slice(bytes); - storage + let layout = std::alloc::Layout::from_size_align(bytes.len(), alignment).unwrap(); + let data = if bytes.len() == 0 { + std::ptr::null() + } else { + let ptr = unsafe { std::alloc::alloc(layout) }; + assert!(!ptr.is_null()); + unsafe { ptr.copy_from_nonoverlapping(bytes.as_ptr(), bytes.len()) }; + ptr + } as *mut u8; + Self::from_raw_parts(data, layout) } -} -impl Clone for RawCPUBuffer { - fn clone(&self) -> Self { - let (ptr, layout) = self.inner(); + pub fn deep_clone(&self) -> Self { + let (ptr, layout) = self.inner().into_raw_parts(); let alloc = unsafe { std::alloc::alloc(layout) }; unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; - Self(alloc, layout) + Self::from_raw_parts(alloc, layout) } } -impl Drop for RawCPUBuffer { - fn drop(&mut self) { - if !self.0.is_null() && self.1.size() > 0 { - unsafe { std::alloc::dealloc(self.0, self.1) } - } +impl DeviceStorage for CPUBuffer { + fn to_device(&self, device: &Device) -> Result { + let gpu_device = device.try_gpu()?; + let raw = self.inner(); + let (ptr, layout) = raw.into_raw_parts(); + let bytes = unsafe { std::slice::from_raw_parts(ptr, layout.size()) }; + Ok(GPUBuffer::from_bytes(bytes, layout.align(), gpu_device)) } -} -impl DeviceStorage for RawCPUBuffer { - fn to_device(self, device: &Device) -> Result { - let (bytes, align, gpu_device) = (self.as_bytes(), self.1.align(), device.try_gpu()?); - Ok(RawGPUBuffer::from_bytes(bytes, align, gpu_device)) - } - - fn to_cpu(&self, _device: &Device) -> Result { + fn to_cpu(&self, _device: &Device) -> Result { Ok(self.clone()) } fn n_bytes(&self) -> usize { - self.1.size() + self.inner().n_bytes() } fn dump(&self, dtype: DType, full: bool) -> String { - let bytes = unsafe { std::slice::from_raw_parts(self.0, self.1.size()) }; + let bytes = self.inner().as_bytes(); fn dump_inner(data: &[T], full: bool) -> String { let length = if data.len() < 64 { data.len() } else { 64 }; diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index a5baedc7..c42309c8 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -1,8 +1,8 @@ use crate::{ gpu::{BufferDescriptor, WgpuDevice}, - 
gpu::{BufferUsagesExt, GPUBuffer}, - storage::{DeviceStorage, RawCPUBuffer}, - Device, DeviceError, Shape, TensorError, + gpu::{BufferUsagesExt, PooledGPUBuffer}, + storage::{CPUBuffer, DeviceStorage}, + Device, DeviceError, Shape, }; use bytemuck::NoUninit; @@ -10,13 +10,13 @@ use wgpu::BufferUsages; use crate::DType; -#[derive(Clone, derive_new::new)] -pub struct RawGPUBuffer { - pub(crate) inner: GPUBuffer, +#[derive(Clone, Debug, derive_new::new)] +pub struct GPUBuffer { + pub(crate) inner: PooledGPUBuffer, pub(crate) alignment: usize, } -impl RawGPUBuffer { +impl GPUBuffer { const MIN_SIZE: usize = 16; pub fn from_slice(data: &[T], shape: &Shape, device: &WgpuDevice) -> Self { @@ -37,7 +37,7 @@ impl RawGPUBuffer { } else { bytes }; - let buffer = device + let inner = device .create_buffer_init( &BufferDescriptor::new(bytes.len() as _, BufferUsages::standard(), false), bytes, @@ -45,10 +45,7 @@ impl RawGPUBuffer { .unwrap(); device.queue().submit(None); device.poll(wgpu::Maintain::Wait); - Self { - inner: buffer, - alignment, - } + Self { inner, alignment } } /// Returns true if the buffer has all the given usages. @@ -59,7 +56,7 @@ impl RawGPUBuffer { } } - pub fn inner(&self) -> &GPUBuffer { + pub fn inner(&self) -> &PooledGPUBuffer { &self.inner } @@ -68,26 +65,12 @@ impl RawGPUBuffer { } } -impl std::fmt::Debug for RawGPUBuffer { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("RawGPUBuffer") - .field("buf", &self.inner.global_id()) - .finish() - } -} - -impl PartialEq for RawGPUBuffer { - fn eq(&self, other: &Self) -> bool { - self.inner.global_id() == other.inner.global_id() - } -} - -impl DeviceStorage for RawGPUBuffer { - fn to_device(self, _: &Device) -> Result { - Ok(self) +impl DeviceStorage for GPUBuffer { + fn to_device(&self, _: &Device) -> Result { + Ok(self.clone()) } - fn to_cpu(&self, device: &Device) -> Result { + fn to_cpu(&self, device: &Device) -> Result { self.validate_usages(BufferUsages::COPY_SRC)?; let device = device.try_gpu()?; let buffer_slice = self.inner.slice(..); @@ -100,7 +83,7 @@ impl DeviceStorage for RawGPUBuffer { &buffer_slice, move |buffer| { tx.send(match buffer { - Ok(db) => Ok(RawCPUBuffer::from_bytes(&db, alignment)), + Ok(db) => Ok(CPUBuffer::from_bytes(&db, alignment)), Err(error) => Err(error), }) .expect("Failed to send result of read_buffer"); diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index f998dc16..d83cbeac 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -5,116 +5,58 @@ use bytemuck::NoUninit; pub use cpu_buffer::*; pub use gpu_buffer::*; -use crate::{gpu::GPUBuffer, Device, DeviceError, Shape}; +use crate::{Device, DeviceError, Shape}; use crate::DType; #[derive(Debug)] -pub struct Storage { - raw: Option, //Optional as the tensor may not be resolved +pub enum Storage { + CPU(CPUBuffer), + GPU(GPUBuffer), } -unsafe impl Send for Storage {} -unsafe impl Sync for Storage {} - impl Storage { - pub fn empty() -> Self { - Self { raw: None } - } - pub fn from_slice(data: &[T], shape: &Shape, device: &Device) -> Self { - assert_eq!(data.len(), shape.numel()); match device { - Device::CPU => Self { - raw: Some(RawStorage::CPU(RawCPUBuffer::from_slice(data, shape))), - }, - Device::GPU(d) => Self { - raw: Some(RawStorage::GPU(RawGPUBuffer::from_slice(data, shape, d))), - }, + Device::CPU => Storage::CPU(CPUBuffer::from_slice(data, shape)), + Device::GPU(g) => Storage::GPU(GPUBuffer::from_slice(data, 
shape, g)), } } - pub fn set_raw(&mut self, raw: RawStorage) { - self.raw = Some(raw); - } - - pub fn raw(&self) -> Option<&RawStorage> { - self.raw.as_ref() - } - - pub fn try_gpu(&self) -> Result<&RawGPUBuffer, DeviceError> { - match self.raw.as_ref() { - Some(RawStorage::GPU(raw)) => Ok(raw), - _ => Err(DeviceError::DeviceMismatch( - "GPU".to_string(), - "CPU".to_string(), - )), + pub fn dump(&self, dt: DType, full: bool) -> String { + match self { + Storage::CPU(c) => c.dump(dt, full), + Storage::GPU(g) => g.dump(dt, full), } } - pub fn try_cpu(&self) -> Result<&RawCPUBuffer, DeviceError> { - match self.raw.as_ref() { - Some(RawStorage::CPU(raw)) => Ok(raw), - _ => Err(DeviceError::DeviceMismatch( - "CPU".to_string(), - "GPU".to_string(), - )), + pub fn try_cpu(&self) -> Result<&CPUBuffer, DeviceError> { + match self { + Storage::CPU(c) => Ok(c), + _ => unimplemented!(), } } - pub fn dump(&self, dtype: DType, full: bool) -> String { - self.raw - .as_ref() - .map(|raw| match raw { - RawStorage::CPU(raw) => raw.dump(dtype, full), - RawStorage::GPU(raw) => raw.dump(dtype, full), - }) - .unwrap_or_else(|| "None".to_string()) - } -} - -impl From for Storage { - fn from(raw: RawStorage) -> Self { - Self { raw: Some(raw) } - } -} - -impl From for Storage { - fn from(raw: RawCPUBuffer) -> Self { - Self { - raw: Some(RawStorage::CPU(raw)), + pub fn try_gpu(&self) -> Result<&GPUBuffer, DeviceError> { + match self { + Storage::GPU(g) => Ok(g), + _ => unimplemented!(), } } -} -impl From for Storage { - fn from(raw: RawGPUBuffer) -> Self { - Self { - raw: Some(RawStorage::GPU(raw)), + pub fn deep_clone(&self, _: &Device) -> Result { + match self { + Storage::CPU(buf) => Ok(Storage::CPU(buf.deep_clone())), + _ => todo!(), } } } -#[derive(Debug)] -pub enum RawStorage { - CPU(RawCPUBuffer), - GPU(RawGPUBuffer), -} - -impl RawStorage { - pub fn from_gpu(buf: GPUBuffer, dtype: DType) -> Self { - RawStorage::GPU(RawGPUBuffer { - inner: buf, - alignment: dtype.size_of(), - }) - } -} - pub trait DeviceStorage: std::fmt::Debug + Clone + 'static { // To be expanded to other devices - fn to_device(self, device: &Device) -> Result; + fn to_device(&self, device: &Device) -> Result; /// Creates a copy of the device buffer on the CPU - fn to_cpu(&self, device: &Device) -> Result; + fn to_cpu(&self, device: &Device) -> Result; fn n_bytes(&self) -> usize; fn dump(&self, dt: DType, full: bool) -> String; } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 683e5dce..478c66b2 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -1,12 +1,12 @@ use crate::gpu::{CpuUniform, WgpuDevice}; use crate::{ - ops::*, CompiledOp, DType, Device, DeviceStorage, Executable, Operation, OperationError, - RawCPUBuffer, RawStorage, Shape, Storage, Strides, TensorDType, TensorId, + ops::*, CPUBuffer, CompiledOp, DType, Device, DeviceStorage, Executable, GPUBuffer, Operation, + OperationError, Shape, Storage, Strides, TensorDType, TensorId, }; use crate::{BinaryOp, LazyOp}; use derive_new::new; -use parking_lot::RwLock; +use parking_lot::{RwLock, RwLockReadGuard}; use std::sync::Arc; #[cfg(feature = "rand")] @@ -43,21 +43,24 @@ pub struct Tensor { } impl Tensor { - fn new(op: LazyOp, meta: StorageView, storage: Storage, device: Device) -> Self { + fn new(op: LazyOp, meta: StorageView, storage: Option, device: Device) -> Self { Self { inner: Arc::new(Inner::new(op, meta, storage, device)), } } fn lazy(op: LazyOp, meta: StorageView, device: Device) -> Self { - 
Self::new(op, meta, Storage::empty(), device) + Self::new(op, meta, None, device) + } + + fn update_storage(&self, storage: Storage) { + *self.inner.storage.write() = Some(storage); } } impl std::fmt::Debug for Tensor { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let storage = self.storage().try_read().expect("Could not read storage"); - let storage_fmt = storage.dump(self.dt(), false); + let storage_fmt = self.storage().as_ref().map(|s| s.dump(self.dt(), false)); let (id, op) = (self.id(), self.op()); f.debug_struct("Tensor") .field("id", &id) @@ -94,7 +97,7 @@ pub struct Inner { op: LazyOp, device: Device, view: StorageView, - storage: Arc>, + storage: Arc>>, } impl AsRef for Inner { @@ -104,7 +107,7 @@ impl AsRef for Inner { } impl Inner { - fn new(op: LazyOp, meta: StorageView, storage: Storage, device: Device) -> Self { + fn new(op: LazyOp, meta: StorageView, storage: Option, device: Device) -> Self { Self { id: TensorId::new(), view: meta, @@ -144,19 +147,12 @@ impl Tensor { &self.device } - pub fn storage(&self) -> &Arc> { - &self.storage + pub fn storage(&self) -> RwLockReadGuard> { + self.inner.storage.read() } pub fn resolved(&self) -> bool { - self.storage().try_read().unwrap().raw().is_some() - } - - /// # Safety - /// - /// Make sure your device & storage are compatible. - pub(crate) unsafe fn set_storage(&self, storage: Storage) { - *self.storage().write() = storage; + self.storage().is_some() } pub(crate) fn op(&self) -> &LazyOp { @@ -214,7 +210,7 @@ impl Tensor { let storage = Storage::from_slice(data.as_ref(), &shape, &device); let strides = Strides::from(&shape); let meta = StorageView::new(shape, T::dt(), strides); - Tensor::new(LazyOp::Const, meta, storage, device) + Tensor::new(LazyOp::Const, meta, Some(storage), device) } fn execution_order(&self) -> Vec { @@ -264,11 +260,14 @@ impl Tensor { for t in execution_order { if !t.resolved() { let id = t.id(); - let gpu_buf = allocations.get(&id).ok_or(TensorError::NoStorage(id))?; + let pooled_buffer = allocations.get(&id).ok_or(TensorError::NoStorage(id))?; assert!(t.device().is_gpu()); - unsafe { - t.set_storage(Storage::from(RawStorage::from_gpu(gpu_buf.clone(), t.dt()))); - } + + let storage = Storage::GPU(GPUBuffer { + inner: pooled_buffer.clone(), + alignment: t.dt().size_of(), + }); + t.update_storage(storage); } if let Some(compiled_op) = t.compile(&mut uniform, device) { @@ -281,36 +280,69 @@ impl Tensor { Ok(()) } - async fn to_cpu(&self) -> Result { - let raw_gpu_buf = { - let storage_resource = self.storage().try_read().ok_or(TensorError::NotResolved)?; - storage_resource.try_gpu()?.clone() + fn to_cpu(&self) -> Result { + if self.device().is_cpu() || !self.resolved() { + return Ok(self.clone()); + } + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let gpu_buf = match storage { + Storage::GPU(g) => g, + _ => unreachable!(), }; + let cpu_buf = gpu_buf.to_cpu(&self.device)?; + Ok(Tensor::new( LazyOp::Const, self.view.clone(), - Storage::from(raw_gpu_buf.to_cpu(self.device())?), + Some(Storage::CPU(cpu_buf)), Device::CPU, )) } + fn to_gpu(&self, dst_device: &Device) -> Result { + if self.device().is_gpu() || !self.resolved() { + return Ok(self.clone()); + } + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let cpu_buf = match storage { + Storage::CPU(g) => g, + _ => unreachable!(), + }; + let gpu_buf = cpu_buf.to_device(dst_device)?; + + let wgpu_device = dst_device.try_gpu()?; + Ok(Tensor::new( + 
LazyOp::Const, + self.view.clone(), + Some(Storage::GPU(gpu_buf)), + Device::GPU(wgpu_device.clone()), + )) + } + + /// Transfers the tensor to the specified device. + /// + /// If the tensor is already on the specified device, it will be returned as-is, + /// and the underlying storage will not be copied. + /// If the tensor is on a different device, it will be copied to the specified device. pub fn to(&self, device: Device) -> Result { - match (self.device(), device) { - (Device::GPU(_), Device::CPU) => pollster::block_on(self.to_cpu()), - (Device::CPU, Device::GPU(_)) => todo!(), + match (self.device(), &device) { + (Device::GPU(_), Device::CPU) => self.to_cpu(), + (Device::CPU, Device::GPU(_)) => self.to_gpu(&device), _ => Ok(self.clone()), } } #[cfg(feature = "pyo3")] - pub fn into_ndarray(&self) -> ArrayD { + pub fn into_ndarray(self) -> ArrayD { assert!(self.device().is_cpu()); - let storage = self.storage().try_read().unwrap(); - let raw_cpu = storage.try_cpu().unwrap(); let shape = self.shape().to_vec(); if self.num_bytes() != 0 { - let ptr = raw_cpu.inner().0 as *const T; - unsafe { ArrayViewD::from_shape_ptr(shape, ptr).to_owned() } + let storage_guard = self.storage(); + let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); + let (ptr, _) = buffer.inner().into_raw_parts(); + unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() } @@ -322,7 +354,19 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(*py, self.clone().into_ndarray::()) + PyArray::from_owned_array(*py, self.deep_clone().into_ndarray::()) + } + + pub fn deep_clone(&self) -> Tensor { + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let cloned_storage = storage.deep_clone(self.device()).unwrap(); + Tensor::new( + LazyOp::Const, + self.view.clone(), + Some(cloned_storage), + self.device.clone(), + ) } } @@ -338,13 +382,16 @@ impl From> for Tensor { let shape = it.shape().to_vec().into(); let strides = Strides::from(&shape); let vec = it.into_raw_vec().into_boxed_slice(); - //This is causing a double free let ptr = Box::into_raw(vec) as *mut u8; - let raw_buf = RawCPUBuffer::new(ptr, layout); - let storage = Storage::from(RawStorage::CPU(raw_buf)); + let cpu_buf = CPUBuffer::from_raw_parts(ptr, layout); let meta = StorageView::new(shape, T::dt(), strides); - Tensor::new(LazyOp::Const, meta, storage, Device::CPU) + Tensor::new( + LazyOp::Const, + meta, + Some(Storage::CPU(cpu_buf)), + Device::CPU, + ) } else { panic!("Cannot convert numpy array with non-contiguous memory layout to tensor"); } @@ -383,10 +430,20 @@ mod tests { #[test] fn test_pyo3() -> anyhow::Result<()> { - let device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], device.clone()); - let b = Tensor::randn::(shape![1024, 1024], device.clone()); - let c: anyhow::Result = Python::with_gil(|py| { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; + + let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( py, r#" @@ -405,7 +462,8 @@ def 
matmul(a, b): .extract::<&PyArrayDyn>()?; Ok(Tensor::from(result)) }); - println!("\nTORCH: {:#?}", c); + println!("\nTORCH: {:#?}", ground); + println!("\nOURS: {:#?}", our_result); Ok(()) } From cd31192df8d7c9650bb6e8897bb00f5979f2d1fc Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 16:49:01 +0000 Subject: [PATCH 08/37] chore: not terrible --- .../ratchet-core/src/gpu/buffer_allocator.rs | 2 +- crates/ratchet-core/src/gpu/device.rs | 7 ++-- crates/ratchet-core/src/storage/gpu_buffer.rs | 22 +++++++++++- crates/ratchet-core/src/storage/mod.rs | 7 ++-- crates/ratchet-core/src/tensor.rs | 36 +++++++++---------- 5 files changed, 49 insertions(+), 25 deletions(-) diff --git a/crates/ratchet-core/src/gpu/buffer_allocator.rs b/crates/ratchet-core/src/gpu/buffer_allocator.rs index e6017fb2..a6af07c4 100644 --- a/crates/ratchet-core/src/gpu/buffer_allocator.rs +++ b/crates/ratchet-core/src/gpu/buffer_allocator.rs @@ -158,7 +158,7 @@ impl BufferAllocator { let output = execution_order.last().unwrap(); assignments.insert( output.id(), - device.allocate_buffer(&BufferDescriptor { + device.get_or_create_buffer(&BufferDescriptor { size: output.num_bytes() as _, usage: BufferUsages::standard(), mapped_at_creation: false, diff --git a/crates/ratchet-core/src/gpu/device.rs b/crates/ratchet-core/src/gpu/device.rs index a5fa847c..04bfb0f6 100644 --- a/crates/ratchet-core/src/gpu/device.rs +++ b/crates/ratchet-core/src/gpu/device.rs @@ -147,7 +147,7 @@ impl WgpuDevice { } impl WgpuDevice { - pub fn create_buffer_init( + pub fn get_or_create_buffer_init( &self, desc: &BufferDescriptor, contents: &[u8], @@ -161,7 +161,10 @@ impl WgpuDevice { self.buffer_allocator.create_uniform_init(cpu_uniform, self) } - pub fn allocate_buffer(&self, desc: &BufferDescriptor) -> Result { + pub fn get_or_create_buffer( + &self, + desc: &BufferDescriptor, + ) -> Result { Ok(self.buffer_allocator.create_buffer(desc, self)) } diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index c42309c8..cd268568 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -38,7 +38,7 @@ impl GPUBuffer { bytes }; let inner = device - .create_buffer_init( + .get_or_create_buffer_init( &BufferDescriptor::new(bytes.len() as _, BufferUsages::standard(), false), bytes, ) @@ -63,6 +63,26 @@ impl GPUBuffer { pub fn usage(&self) -> BufferUsages { self.inner.usage() } + + pub fn deep_clone(&self, device: &WgpuDevice) -> Self { + //Here we need to create a buffer just like ours + let clone = device + .get_or_create_buffer(&BufferDescriptor::new( + self.inner.size(), + self.inner.usage(), + false, + )) + .unwrap(); + let mut encoder = + device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); + encoder.copy_buffer_to_buffer(&self.inner, 0, &clone, 0, self.inner.size()); + device.queue().submit(Some(encoder.finish())); + device.poll(wgpu::Maintain::Wait); + Self { + inner: clone, + alignment: self.alignment, + } + } } impl DeviceStorage for GPUBuffer { diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index d83cbeac..921a6c20 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -44,10 +44,13 @@ impl Storage { } } - pub fn deep_clone(&self, _: &Device) -> Result { + pub fn deep_clone(&self, device: &Device) -> Result { match self { Storage::CPU(buf) => Ok(Storage::CPU(buf.deep_clone())), - _ => todo!(), + 
Storage::GPU(buf) => { + let gpu_device = device.try_gpu()?; + Ok(Storage::GPU(buf.deep_clone(gpu_device))) + } } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 478c66b2..12db2744 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -285,11 +285,10 @@ impl Tensor { return Ok(self.clone()); } let storage_guard = self.storage(); - let storage = storage_guard.as_ref().unwrap(); - let gpu_buf = match storage { - Storage::GPU(g) => g, - _ => unreachable!(), - }; + let gpu_buf = storage_guard + .as_ref() + .ok_or(TensorError::TransferError)? + .try_gpu()?; let cpu_buf = gpu_buf.to_cpu(&self.device)?; Ok(Tensor::new( @@ -305,11 +304,10 @@ impl Tensor { return Ok(self.clone()); } let storage_guard = self.storage(); - let storage = storage_guard.as_ref().unwrap(); - let cpu_buf = match storage { - Storage::CPU(g) => g, - _ => unreachable!(), - }; + let cpu_buf = storage_guard + .as_ref() + .ok_or(TensorError::TransferError)? + .try_cpu()?; let gpu_buf = cpu_buf.to_device(dst_device)?; let wgpu_device = dst_device.try_gpu()?; @@ -434,15 +432,6 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( py, @@ -462,6 +451,15 @@ def matmul(a, b): .extract::<&PyArrayDyn>()?; Ok(Tensor::from(result)) }); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; println!("\nTORCH: {:#?}", ground); println!("\nOURS: {:#?}", our_result); From 1859f9e902e388b933b17e9f527839277663a328 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:24:50 +0000 Subject: [PATCH 09/37] chore: check test --- crates/ratchet-core/src/tensor.rs | 80 ++++++++++++++++--------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 12db2744..3d3ca2d3 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -426,43 +426,45 @@ mod tests { Ok(()) } - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() -"#, - "x.py", - "x", - )?; - - let result = prg - .getattr("matmul")? - .call1((a.to_py::(&py), b.to_py::(&py)))? 
- .extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) - }); - - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - println!("\nTORCH: {:#?}", ground); - println!("\nOURS: {:#?}", our_result); - - Ok(()) - } + /* + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let result = prg + .getattr("matmul")? + .call1((a.to_py::(&py), b.to_py::(&py)))? + .extract::<&PyArrayDyn>()?; + Ok(Tensor::from(result)) + }); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; + println!("\nTORCH: {:#?}", ground); + println!("\nOURS: {:#?}", our_result); + + Ok(()) + } + */ } From 21cbe5b2f8ee64785c962741fca9f0679aa6fd70 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:39:13 +0000 Subject: [PATCH 10/37] chore: still freeing somewhere --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/storage/cpu_buffer.rs | 2 + crates/ratchet-core/src/tensor.rs | 72 ++++++++++--------- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7c62f447..3f36e9f6 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -46,6 +46,6 @@ jobs: - name: Build run: cargo build - name: Run tests - run: cargo test + run: cargo test -- --nocapture - name: Run integration tests run: (cd crates/ratchet-integration-tests;sh run-tests.sh) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 1a4b3d21..5feecce7 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -94,8 +94,10 @@ impl CPUBuffer { pub fn deep_clone(&self) -> Self { let (ptr, layout) = self.inner().into_raw_parts(); + println!("before deep clone: {:p}", ptr); let alloc = unsafe { std::alloc::alloc(layout) }; unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; + println!("after deep clone: {:p}", alloc); Self::from_raw_parts(alloc, layout) } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 3d3ca2d3..40cb2978 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -340,6 +340,7 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); + println!("INTO NDARRAY: {:?}", ptr); unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() @@ -426,45 +427,46 @@ mod tests { Ok(()) } - /* - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - let 
ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" +import torch + +def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() "#, - "x.py", - "x", - )?; + "x.py", + "x", + )?; + + let result = prg + .getattr("matmul")? + .call1((a.to_py::(&py), b.to_py::(&py)))? + .extract::<&PyArrayDyn>()?; + Ok(Tensor::from(result)) + }); + println!("\nTORCH: {:#?}", ground); - let result = prg - .getattr("matmul")? - .call1((a.to_py::(&py), b.to_py::(&py)))? - .extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) - }); + println!("\nA: {:#?}", a); + println!("\nB: {:#?}", b); - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; - let c = a.matmul(&b)?; - c.resolve()?; + let c = a.matmul(&b)?; + c.resolve()?; - let our_result = c.to(cpu_device)?; - println!("\nTORCH: {:#?}", ground); - println!("\nOURS: {:#?}", our_result); + let our_result = c.to(cpu_device)?; + println!("\nOURS: {:#?}", our_result); - Ok(()) - } - */ + Ok(()) + } } From 4daf1f4daa96e50b479e23bde75379b9ffa658fb Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:49:10 +0000 Subject: [PATCH 11/37] chore: very confusing --- crates/ratchet-core/src/tensor.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 40cb2978..d5a5c2fd 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -353,7 +353,10 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(*py, self.deep_clone().into_ndarray::()) + println!("TO PY: {:?}", self); + let cloned = self.deep_clone(); + println!("CLONED: {:?}", cloned); + PyArray::from_owned_array(*py, cloned.into_ndarray::()) } pub fn deep_clone(&self) -> Tensor { From f0fbc7ef9fe36a5160ca6eac26f2c462c5777eac Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 17:59:46 +0000 Subject: [PATCH 12/37] chore: check allocator bug --- crates/ratchet-core/src/tensor.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index d5a5c2fd..25f989b7 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -340,7 +340,7 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); - println!("INTO NDARRAY: {:?}", ptr); + println!("POINTER PASSED TO NDARRAY: {:?}", ptr); unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() @@ -353,9 +353,7 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - println!("TO PY: {:?}", self); let cloned = self.deep_clone(); - 
println!("CLONED: {:?}", cloned); PyArray::from_owned_array(*py, cloned.into_ndarray::()) } @@ -433,8 +431,8 @@ mod tests { #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let a = Tensor::randn::(shape![1024, 512], cpu_device.clone()); + let b = Tensor::randn::(shape![512, 384], cpu_device.clone()); let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( From 14a4a1e0b870303bf35b5909b34977b8cdf5d335 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 18:07:52 +0000 Subject: [PATCH 13/37] chore: print drop --- crates/ratchet-core/src/storage/cpu_buffer.rs | 1 + crates/ratchet-core/src/tensor.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 5feecce7..a9e0d606 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -36,6 +36,7 @@ impl Clone for RawCPUBuffer { impl Drop for RawCPUBuffer { fn drop(&mut self) { if !self.0.is_null() && self.1.size() > 0 { + println!("DROPPING CPU BUFFER: {:p}", self.0); unsafe { std::alloc::dealloc(self.0, self.1) } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 25f989b7..9fe056c5 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -413,6 +413,7 @@ mod tests { use super::*; + /* #[test] fn test_matmul() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; @@ -427,6 +428,7 @@ mod tests { println!("\nD: {:#?}", d); Ok(()) } + */ #[test] fn test_pyo3() -> anyhow::Result<()> { From 70f01c569fdc8dc00b1c2f35035187f1a493a71d Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Sun, 21 Jan 2024 18:24:40 +0000 Subject: [PATCH 14/37] chore: strange --- crates/ratchet-core/src/storage/cpu_buffer.rs | 15 ++++++++----- crates/ratchet-core/src/tensor.rs | 22 ++++++++----------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index a9e0d606..d7ae546b 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -25,18 +25,21 @@ impl RawCPUBuffer { impl Clone for RawCPUBuffer { fn clone(&self) -> Self { - let (ptr, layout) = self.into_raw_parts(); - let alloc = unsafe { std::alloc::alloc(layout) }; - unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; - - Self(alloc, layout) + let data = if self.1.size() == 0 { + std::ptr::null() + } else { + let ptr = unsafe { std::alloc::alloc(self.1) }; + assert!(!ptr.is_null()); + ptr + } as *mut u8; + unsafe { self.0.copy_to_nonoverlapping(data, self.1.size()) }; + Self(data, self.1) } } impl Drop for RawCPUBuffer { fn drop(&mut self) { if !self.0.is_null() && self.1.size() > 0 { - println!("DROPPING CPU BUFFER: {:p}", self.0); unsafe { std::alloc::dealloc(self.0, self.1) } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 9fe056c5..7ffe2fe3 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -340,21 +340,19 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); - println!("POINTER 
PASSED TO NDARRAY: {:?}", ptr); - unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } + unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).into_owned() } } else { - ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() + ArrayViewD::from_shape(shape, &[]).unwrap().into_owned() } } #[cfg(feature = "pyo3")] - pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>( - &'s self, - py: &'p pyo3::Python<'p>, + pub fn to_py<'p, T: TensorDType + numpy::Element>( + self, + py: pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - let cloned = self.deep_clone(); - PyArray::from_owned_array(*py, cloned.into_ndarray::()) + PyArray::from_owned_array(py, self.into_ndarray::()) } pub fn deep_clone(&self) -> Tensor { @@ -413,7 +411,6 @@ mod tests { use super::*; - /* #[test] fn test_matmul() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; @@ -428,13 +425,12 @@ mod tests { println!("\nD: {:#?}", d); Ok(()) } - */ #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 512], cpu_device.clone()); - let b = Tensor::randn::(shape![512, 384], cpu_device.clone()); + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let ground: anyhow::Result = Python::with_gil(|py| { let prg = PyModule::from_code( @@ -451,7 +447,7 @@ def matmul(a, b): let result = prg .getattr("matmul")? - .call1((a.to_py::(&py), b.to_py::(&py)))? + .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? .extract::<&PyArrayDyn>()?; Ok(Tensor::from(result)) }); From 4cf51d69438a7073d2410f471621f6f1866bed73 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 10:26:17 +0000 Subject: [PATCH 15/37] chore: faster wasm-pack --- .github/workflows/rust.yml | 4 +- crates/ratchet-core/src/tensor.rs | 87 ++++++++++++++++--------------- 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 3f36e9f6..37bd93c5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -40,9 +40,9 @@ jobs: python-version: '3.10.6' cache: 'pip' - run: pip install -r requirements.txt - - name: Setup + - name: Install wasm-pack run: | - cargo install wasm-pack + curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh - name: Build run: cargo build - name: Run tests diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 7ffe2fe3..e51454bc 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -426,46 +426,49 @@ mod tests { Ok(()) } - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let result = prg - .getattr("matmul")? - .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? 
- .extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) - }); - println!("\nTORCH: {:#?}", ground); - - println!("\nA: {:#?}", a); - println!("\nB: {:#?}", b); - - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - println!("\nOURS: {:#?}", our_result); - - Ok(()) - } + /* + + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let result = prg + .getattr("matmul")? + .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? + .extract::<&PyArrayDyn>()?; + Ok(Tensor::from(result)) + }); + println!("\nTORCH: {:#?}", ground); + + println!("\nA: {:#?}", a); + println!("\nB: {:#?}", b); + + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = a.to(gpu_device.clone())?; + let b = b.to(gpu_device)?; + + let c = a.matmul(&b)?; + c.resolve()?; + + let our_result = c.to(cpu_device)?; + println!("\nOURS: {:#?}", our_result); + + Ok(()) + } + */ } From 4388c98ee91144a2036f37bf5bdf9239c3bd2355 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 10:28:20 +0000 Subject: [PATCH 16/37] chore: downgrade wgpu --- Cargo.toml | 2 +- crates/ratchet-core/src/gpu/device.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 20543cea..6b06221c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ strip = true #debug = 2 [workspace.dependencies] -wgpu = { version = "0.19.0", features = ["fragile-send-sync-non-atomic-wasm"] } +wgpu = { version = "0.18.0", features = ["fragile-send-sync-non-atomic-wasm", "expose-ids"] } anyhow = "1.0.40" bytemuck = "1.14.0" num-traits = "0.2.17" diff --git a/crates/ratchet-core/src/gpu/device.rs b/crates/ratchet-core/src/gpu/device.rs index 04bfb0f6..4cb52286 100644 --- a/crates/ratchet-core/src/gpu/device.rs +++ b/crates/ratchet-core/src/gpu/device.rs @@ -56,7 +56,7 @@ impl WgpuDevice { let adapter = Self::select_adapter()?; #[allow(unused_mut)] - let mut required_features = wgpu::Features::default(); + let mut features = wgpu::Features::default(); #[cfg(feature = "gpu-profiling")] { features |= wgpu::Features::TIMESTAMP_QUERY; @@ -64,8 +64,8 @@ impl WgpuDevice { let mut device_descriptor = wgpu::DeviceDescriptor { label: Some("ratchet"), - required_features, - required_limits: Limits { + features, + limits: Limits { max_buffer_size: MAX_BUFFER_SIZE, max_storage_buffer_binding_size: MAX_BUFFER_SIZE as u32, ..Default::default() @@ -77,7 +77,7 @@ impl WgpuDevice { "Failed to acq. 
device, trying again with reduced limits: {:?}",
                e
            );
            device_descriptor.limits = adapter.limits();
            adapter.request_device(&device_descriptor, None).await
        } else {
            device_request

From a90cb6322e931655c3a7cec42d4daeb9e8a65b7c Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 10:35:42 +0000
Subject: [PATCH 17/37] chore: unused deps, add back tests

---
 crates/ratchet-core/Cargo.toml    |  2 -
 crates/ratchet-core/src/tensor.rs | 67 +++++++++++++++----------------
 2 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/crates/ratchet-core/Cargo.toml b/crates/ratchet-core/Cargo.toml
index f2099d76..31ea049f 100644
--- a/crates/ratchet-core/Cargo.toml
+++ b/crates/ratchet-core/Cargo.toml
@@ -28,9 +28,7 @@ slotmap = "1.0.7"
 parking_lot = "0.12.1"
 smallvec = "1.11.2"
 encase = "0.7.0"
-glam = "0.25.0"
 pollster = "0.3.0"
-futures-intrusive = "0.5.0"
 anyhow = "1.0.79"
 num = "0.4.1"
 rand_distr = { version = "0.4.3", optional = true }

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index e51454bc..4c10b301 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -426,49 +426,46 @@ mod tests {
         Ok(())
     }

-    /*
-
-    #[test]
-    fn test_pyo3() -> anyhow::Result<()> {
-        let cpu_device = Device::request_device(DeviceRequest::CPU)?;
-        let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
-        let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
-
-        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
-            let prg = PyModule::from_code(
-                py,
-                r#"
-    import torch
-
-    def matmul(a, b):
-        return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
-    "#,
-                "x.py",
-                "x",
-            )?;
-
-            let result = prg
-                .getattr("matmul")?
-                .call1((a.clone().to_py::<f32>(py), b.clone().to_py::<f32>(py)))?
-                .extract::<&PyArrayDyn<f32>>()?;
-            Ok(Tensor::from(result))
-        });
-        println!("\nTORCH: {:#?}", ground);
-
-        println!("\nA: {:#?}", a);
-        println!("\nB: {:#?}", b);
-
-        let gpu_device = Device::request_device(DeviceRequest::GPU)?;
-        let a = a.to(gpu_device.clone())?;
-        let b = b.to(gpu_device)?;
-
-        let c = a.matmul(&b)?;
-        c.resolve()?;
-
-        let our_result = c.to(cpu_device)?;
-        println!("\nOURS: {:#?}", our_result);
-
-        Ok(())
-    }
-    */
+    #[test]
+    fn test_pyo3() -> anyhow::Result<()> {
+        let cpu_device = Device::request_device(DeviceRequest::CPU)?;
+        let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+        let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+
+        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
+            let prg = PyModule::from_code(
+                py,
+                r#"
+    import torch
+
+    def matmul(a, b):
+        return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+"#,
+                "x.py",
+                "x",
+            )?;
+
+            let result = prg
+                .getattr("matmul")?
+                .call1((a.clone().to_py::<f32>(py), b.clone().to_py::<f32>(py)))?
+                .extract::<&PyArrayDyn<f32>>()?;
+            Ok(Tensor::from(result))
+        });
+        println!("\nTORCH: {:#?}", ground);
+
+        println!("\nA: {:#?}", a);
+        println!("\nB: {:#?}", b);
+
+        let gpu_device = Device::request_device(DeviceRequest::GPU)?;
+        let a = a.to(gpu_device.clone())?;
+        let b = b.to(gpu_device)?;
+
+        let c = a.matmul(&b)?;
+        c.resolve()?;
+
+        let our_result = c.to(cpu_device)?;
+        println!("\nOURS: {:#?}", our_result);
+
+        Ok(())
+    }
 }

From d40910ba7b757b09f9d83d348fc50b4d22bb39e6 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 10:40:41 +0000
Subject: [PATCH 18/37] chore: tests interacting?
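
The Python helper embedded in the r#"..."# raw string is handed to the
Python compiler verbatim by PyModule::from_code, so module-level
statements must start at column 0; the indented form fails with an
IndentationError before matmul is ever called. Flush the embedded source
left, and drop test_matmul to 512x512 while both tests run in the same
process. A minimal sketch of the failure mode follows; it is illustrative
only, assumes pyo3 0.20 with the auto-initialize feature, and the module
names in it are placeholders rather than anything in this patch:

    use pyo3::{types::PyModule, Python};

    fn main() {
        Python::with_gil(|py| {
            // Indented module-level source is rejected by the Python
            // compiler with an IndentationError before anything executes.
            let indented = "    import math";
            assert!(PyModule::from_code(py, indented, "x.py", "x").is_err());

            // The same statement flush against column 0 compiles fine.
            let flush = "import math";
            assert!(PyModule::from_code(py, flush, "x.py", "x").is_ok());
        });
    }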
---
 crates/ratchet-core/src/tensor.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 4c10b301..a627f8dc 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -414,8 +414,8 @@ mod tests {
     #[test]
     fn test_matmul() -> anyhow::Result<()> {
         let device = Device::request_device(DeviceRequest::GPU)?;
-        let a = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
-        let b = Tensor::randn::<f32>(shape![1024, 1024], device.clone());
+        let a = Tensor::randn::<f32>(shape![512, 512], device.clone());
+        let b = Tensor::randn::<f32>(shape![512, 512], device.clone());
         let c = a.matmul(&b)?;
         c.resolve()?;
         println!("\nA: {:#?}", a);
@@ -436,10 +436,10 @@ mod tests {
             let prg = PyModule::from_code(
                 py,
                 r#"
-    import torch
+import torch

-    def matmul(a, b):
-        return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+def matmul(a, b):
+    return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
 "#,
                 "x.py",
                 "x",

From fd6e6df4f2da6a310f3d873a5e5a2ff79f818c24 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 10:52:42 +0000
Subject: [PATCH 19/37] chore: mem management

chore: confusing

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: dbg

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc

chore: remove arc
---
 .github/workflows/rust.yml                    |  8 +-
 Cargo.toml                                    |  2 +-
 crates/ratchet-core/src/device.rs             |  4 +-
 crates/ratchet-core/src/quant.rs              |  5 --
 crates/ratchet-core/src/storage/cpu_buffer.rs | 77 +++++++++---------
 crates/ratchet-core/src/storage/gpu_buffer.rs |  1 -
 crates/ratchet-core/src/storage/mod.rs        |  7 +-
 crates/ratchet-core/src/tensor.rs             | 80 +++++++++----------
 8 files changed, 84 insertions(+), 100 deletions(-)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 37bd93c5..8caa488c 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -35,17 +35,15 @@ jobs:
         sudo apt install -y libegl1-mesa libgl1-mesa-dri libxcb-xfixes0-dev vulkan-sdk mesa-vulkan-drivers pkg-config libasound2-dev

     - name: Setup python
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: '3.10.6'
         cache: 'pip'
     - run: pip install -r requirements.txt
+    - name: Run tests
+      run: cargo test -- --nocapture
     - name: Install wasm-pack
       run: |
         curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
-    - name: Build
-      run: cargo build
-    - name: Run tests
-      run: cargo test -- --nocapture
     - name: Run integration tests
       run: (cd crates/ratchet-integration-tests;sh run-tests.sh)

diff --git a/Cargo.toml b/Cargo.toml
index 6b06221c..a8cae20f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,7 @@ strip = true
 [workspace.dependencies]
 wgpu = { version = "0.18.0", features = ["fragile-send-sync-non-atomic-wasm", "expose-ids"] }
 anyhow = "1.0.40"
-bytemuck = "1.14.0"
+bytemuck = { version = "1.14.0", features=["wasm_simd", "aarch64_simd", "extern_crate_alloc"] }
 num-traits = "0.2.17"
 half = { version = "2.3.1", features = ["num-traits", "bytemuck"] }
 derive-new = "0.6.0"

diff --git a/crates/ratchet-core/src/device.rs b/crates/ratchet-core/src/device.rs
index d30e8da1..929f7f4a 100644
--- a/crates/ratchet-core/src/device.rs
+++ b/crates/ratchet-core/src/device.rs
@@ -51,7 +51,9 @@ impl Device {
     pub fn request_device(request: DeviceRequest) -> Result<Device, DeviceError> {
         match request {
             DeviceRequest::CPU 
=> Ok(Device::CPU), - DeviceRequest::GPU => Ok(Device::GPU(pollster::block_on(WgpuDevice::new())?)), + DeviceRequest::GPU => Ok(Device::GPU(pollster::block_on(async { + WgpuDevice::new().await + })?)), } } diff --git a/crates/ratchet-core/src/quant.rs b/crates/ratchet-core/src/quant.rs index 1bbad339..c71b9178 100644 --- a/crates/ratchet-core/src/quant.rs +++ b/crates/ratchet-core/src/quant.rs @@ -122,12 +122,9 @@ mod tests { let mut rng = rand::thread_rng(); let range = Uniform::new(-0.2, 0.2); let matrix: Vec = (0..M * N).map(|_| rng.sample(range)).collect(); - println!("Original matrix: {:?}", matrix); let (quantized_matrix, absmax) = super::sint8_quantize(&matrix, M, N); - println!("Absmax: {:?}", absmax); let dequantized_matrix = super::sint8_dequantize(&quantized_matrix, &absmax, M, N); - println!("Dequantized matrix: {:?}", dequantized_matrix); for i in 0..matrix.len() { assert!((matrix[i] - dequantized_matrix[i]).abs() < 0.001); } @@ -138,12 +135,10 @@ mod tests { let matrix = vec![ 0.1, -0.1, 0.6, -0.5, 1.0, -1.0, 1.2, -1.2, 0.1, -0.1, 0.5, -0.5, 1.0, -1.0, 1.2, -1.2, ]; - println!("{:?}", matrix); let (quantized_matrix, absmax) = super::sint4_quantize(&matrix, 4, 4); assert_eq!(quantized_matrix.len(), 2); assert_eq!(quantized_matrix, vec![2544293105, 2544292849]); let dequantized_matrix = super::sint4_dequantize(&quantized_matrix, absmax, 4, 4); - println!("{:?}", dequantized_matrix); for i in 0..matrix.len() { assert!((matrix[i] - dequantized_matrix[i]).abs() < 0.1); } diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index d7ae546b..f1e409fd 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -10,6 +10,10 @@ use crate::DType; pub struct RawCPUBuffer(*mut u8, Layout); impl RawCPUBuffer { + pub fn from_raw_parts(ptr: *mut u8, layout: Layout) -> Self { + Self(ptr, layout) + } + pub fn into_raw_parts(&self) -> (*mut u8, Layout) { (self.0, self.1) } @@ -21,19 +25,36 @@ impl RawCPUBuffer { pub fn as_bytes(&self) -> &[u8] { unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } } + + pub fn as_bytes_mut(&mut self) -> &mut [u8] { + unsafe { std::slice::from_raw_parts_mut(self.0, self.1.size()) } + } + + pub fn uninitialized(size: usize, alignment: usize) -> Self { + let layout = std::alloc::Layout::from_size_align(size, alignment).unwrap(); + let data = if size == 0 { + std::ptr::null() + } else { + let ptr = unsafe { std::alloc::alloc(layout) }; + assert!(!ptr.is_null()); + ptr + } as *mut u8; + Self(data, layout) + } } impl Clone for RawCPUBuffer { fn clone(&self) -> Self { - let data = if self.1.size() == 0 { + let (ptr, layout) = self.into_raw_parts(); + let data = if layout.size() == 0 { std::ptr::null() } else { - let ptr = unsafe { std::alloc::alloc(self.1) }; + let ptr = unsafe { std::alloc::alloc(layout) }; assert!(!ptr.is_null()); ptr } as *mut u8; - unsafe { self.0.copy_to_nonoverlapping(data, self.1.size()) }; - Self(data, self.1) + unsafe { ptr.copy_to_nonoverlapping(data, layout.size()) }; + Self(data, layout) } } @@ -48,7 +69,7 @@ impl Drop for RawCPUBuffer { /// Managed CPU buffer #[derive(Debug, Clone, derive_new::new)] pub struct CPUBuffer { - inner: Arc, + inner: RawCPUBuffer, } unsafe impl Send for CPUBuffer {} @@ -61,49 +82,25 @@ impl CPUBuffer { Self::from_bytes(bytes, std::mem::align_of::()) } - pub fn inner(&self) -> &Arc { + pub fn inner(&self) -> &RawCPUBuffer { &self.inner } - unsafe fn uninitialized(size: usize, alignment: usize) -> 
Self { - let layout = std::alloc::Layout::from_size_align(size, alignment).unwrap(); - let data = if size == 0 { - std::ptr::null() - } else { - let ptr = std::alloc::alloc(layout); - assert!(!ptr.is_null()); - ptr - } as *mut u8; - Self::from_raw_parts(data, layout) - } - - pub fn from_raw_parts(data: *mut u8, layout: Layout) -> Self { - Self { - inner: Arc::new(RawCPUBuffer(data, layout)), - } - } - pub fn from_bytes(bytes: &[u8], alignment: usize) -> Self { - let layout = std::alloc::Layout::from_size_align(bytes.len(), alignment).unwrap(); - let data = if bytes.len() == 0 { - std::ptr::null() - } else { - let ptr = unsafe { std::alloc::alloc(layout) }; - assert!(!ptr.is_null()); - unsafe { ptr.copy_from_nonoverlapping(bytes.as_ptr(), bytes.len()) }; - ptr - } as *mut u8; - Self::from_raw_parts(data, layout) + let mut raw = RawCPUBuffer::uninitialized(bytes.len(), alignment); + raw.as_bytes_mut().copy_from_slice(bytes); + Self::from(raw) } pub fn deep_clone(&self) -> Self { - let (ptr, layout) = self.inner().into_raw_parts(); - println!("before deep clone: {:p}", ptr); - let alloc = unsafe { std::alloc::alloc(layout) }; - unsafe { ptr.copy_to_nonoverlapping(alloc, layout.size()) }; - println!("after deep clone: {:p}", alloc); + let raw_clone = (*self.inner()).clone(); + Self::from(raw_clone) + } +} - Self::from_raw_parts(alloc, layout) +impl From for CPUBuffer { + fn from(raw: RawCPUBuffer) -> Self { + CPUBuffer { inner: raw } } } diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index cd268568..1631d3c7 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -65,7 +65,6 @@ impl GPUBuffer { } pub fn deep_clone(&self, device: &WgpuDevice) -> Self { - //Here we need to create a buffer just like ours let clone = device .get_or_create_buffer(&BufferDescriptor::new( self.inner.size(), diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index 921a6c20..f6bcc0d3 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -46,11 +46,8 @@ impl Storage { pub fn deep_clone(&self, device: &Device) -> Result { match self { - Storage::CPU(buf) => Ok(Storage::CPU(buf.deep_clone())), - Storage::GPU(buf) => { - let gpu_device = device.try_gpu()?; - Ok(Storage::GPU(buf.deep_clone(gpu_device))) - } + Storage::CPU(c) => Ok(Storage::CPU(c.deep_clone())), + _ => unimplemented!(), } } } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index a627f8dc..dcadedfe 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -1,7 +1,7 @@ use crate::gpu::{CpuUniform, WgpuDevice}; use crate::{ ops::*, CPUBuffer, CompiledOp, DType, Device, DeviceStorage, Executable, GPUBuffer, Operation, - OperationError, Shape, Storage, Strides, TensorDType, TensorId, + OperationError, RawCPUBuffer, Shape, Storage, Strides, TensorDType, TensorId, }; use crate::{BinaryOp, LazyOp}; @@ -340,29 +340,32 @@ impl Tensor { let storage_guard = self.storage(); let buffer = storage_guard.as_ref().unwrap().try_cpu().unwrap(); let (ptr, _) = buffer.inner().into_raw_parts(); - unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).into_owned() } + unsafe { ArrayViewD::from_shape_ptr(shape, ptr as *const T).to_owned() } } else { - ArrayViewD::from_shape(shape, &[]).unwrap().into_owned() + ArrayViewD::from_shape(shape, &[]).unwrap().to_owned() } } #[cfg(feature = "pyo3")] - pub fn to_py<'p, 
T: TensorDType + numpy::Element>( - self, - py: pyo3::Python<'p>, + pub fn to_py<'s, 'p: 's, T: TensorDType + numpy::Element>( + &'s self, + py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(py, self.into_ndarray::()) + PyArray::from_owned_array(*py, self.clone().into_ndarray::()) } pub fn deep_clone(&self) -> Tensor { - let storage_guard = self.storage(); - let storage = storage_guard.as_ref().unwrap(); - let cloned_storage = storage.deep_clone(self.device()).unwrap(); + let storage_clone = self + .storage() + .as_ref() + .unwrap() + .deep_clone(self.device()) + .unwrap(); Tensor::new( - LazyOp::Const, + self.op().clone(), self.view.clone(), - Some(cloned_storage), + Some(storage_clone), self.device.clone(), ) } @@ -382,12 +385,12 @@ impl From> for Tensor { let vec = it.into_raw_vec().into_boxed_slice(); let ptr = Box::into_raw(vec) as *mut u8; - let cpu_buf = CPUBuffer::from_raw_parts(ptr, layout); + let raw_buf = RawCPUBuffer::from_raw_parts(ptr, layout); let meta = StorageView::new(shape, T::dt(), strides); Tensor::new( LazyOp::Const, meta, - Some(Storage::CPU(cpu_buf)), + Some(Storage::CPU(CPUBuffer::from(raw_buf))), Device::CPU, ) } else { @@ -414,15 +417,12 @@ mod tests { #[test] fn test_matmul() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; - let a = Tensor::randn::(shape![512, 512], device.clone()); - let b = Tensor::randn::(shape![512, 512], device.clone()); + let a = Tensor::randn::(shape![1024, 1024], device.clone()); + let b = Tensor::randn::(shape![1024, 1024], device.clone()); let c = a.matmul(&b)?; c.resolve()?; - println!("\nA: {:#?}", a); - println!("\nB: {:#?}", b); - println!("\nC: {:#?}", c); let d = c.to(Device::CPU)?; - println!("\nD: {:#?}", d); + println!("{:?}", d); Ok(()) } @@ -436,36 +436,32 @@ mod tests { let prg = PyModule::from_code( py, r#" -import torch + import torch -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, "x.py", "x", )?; - let result = prg + let py_a = a.to_py::(&py); + let py_b = b.to_py::(&py); + + let py_c = prg .getattr("matmul")? - .call1((a.clone().to_py::(py), b.clone().to_py::(py)))? + .call1((py_a, py_b))? 
.extract::<&PyArrayDyn>()?; - Ok(Tensor::from(result)) + Ok(Tensor::from(py_c)) }); - println!("\nTORCH: {:#?}", ground); - - println!("\nA: {:#?}", a); - println!("\nB: {:#?}", b); - - let gpu_device = Device::request_device(DeviceRequest::GPU)?; - let a = a.to(gpu_device.clone())?; - let b = b.to(gpu_device)?; - - let c = a.matmul(&b)?; - c.resolve()?; - - let our_result = c.to(cpu_device)?; - println!("\nOURS: {:#?}", our_result); - + let device = Device::request_device(DeviceRequest::GPU)?; + let a_gpu = a.to(device.clone())?; + let b_gpu = b.to(device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d_gpu = c_gpu.to(Device::CPU)?; + println!("Ours: {:?}", d_gpu); + println!("Ground: {:?}", ground); Ok(()) } } From 52dbfc351ad1a0fd3af3698396453685401824e2 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:11:29 +0000 Subject: [PATCH 20/37] chore: try --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/tensor.rs | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8caa488c..8c2432b5 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test -- --nocapture + run: cargo test dbg -- --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index dcadedfe..dacb43bb 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -415,17 +415,20 @@ mod tests { use super::*; #[test] - fn test_matmul() -> anyhow::Result<()> { + fn dbg() -> anyhow::Result<()> { let device = Device::request_device(DeviceRequest::GPU)?; - let a = Tensor::randn::(shape![1024, 1024], device.clone()); - let b = Tensor::randn::(shape![1024, 1024], device.clone()); - let c = a.matmul(&b)?; - c.resolve()?; - let d = c.to(Device::CPU)?; - println!("{:?}", d); + for _ in 0..10 { + let a = Tensor::randn::(shape![128, 128], device.clone()); + let b = Tensor::randn::(shape![128, 128], device.clone()); + let c = a.matmul(&b)?; + c.resolve()?; + let d = c.to(Device::CPU)?; + println!("{:?}", d); + } Ok(()) } + /* #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -464,4 +467,5 @@ mod tests { println!("Ground: {:?}", ground); Ok(()) } + */ } From 1528bdcfa334811598c7591513f36200a7451cd4 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:16:58 +0000 Subject: [PATCH 21/37] chore: try --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/tensor.rs | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8c2432b5..b7356739 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test dbg -- --nocapture + run: cargo test tensor -- --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index dacb43bb..8e0467a1 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -416,13 +416,16 @@ mod tests { #[test] fn dbg() -> anyhow::Result<()> { - 
let device = Device::request_device(DeviceRequest::GPU)?; + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; for _ in 0..10 { - let a = Tensor::randn::(shape![128, 128], device.clone()); - let b = Tensor::randn::(shape![128, 128], device.clone()); - let c = a.matmul(&b)?; - c.resolve()?; - let d = c.to(Device::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let a_gpu = a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + let d = c_gpu.to(Device::CPU)?; println!("{:?}", d); } Ok(()) From 54b6525210157c07b847a0421fc3e5ae3244be31 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:19:10 +0000 Subject: [PATCH 22/37] chore: try --- crates/ratchet-core/src/tensor.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 8e0467a1..3cdf699d 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -419,12 +419,13 @@ mod tests { let cpu_device = Device::request_device(DeviceRequest::CPU)?; let gpu_device = Device::request_device(DeviceRequest::GPU)?; for _ in 0..10 { - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let a = Tensor::randn::(shape![128, 128], cpu_device.clone()); + let b = Tensor::randn::(shape![128, 128], cpu_device.clone()); let a_gpu = a.to(gpu_device.clone())?; let b_gpu = b.to(gpu_device.clone())?; let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; let d = c_gpu.to(Device::CPU)?; println!("{:?}", d); } From a7d41fc4c1b77fc2d6ecba2b3e58eaf8bc5fc952 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:23:16 +0000 Subject: [PATCH 23/37] chore: try --- crates/ratchet-core/src/tensor.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 3cdf699d..66001718 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -414,6 +414,7 @@ mod tests { use super::*; + /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -431,8 +432,8 @@ mod tests { } Ok(()) } + */ - /* #[test] fn test_pyo3() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -443,24 +444,28 @@ mod tests { let prg = PyModule::from_code( py, r#" - import torch +import torch - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() +def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() "#, "x.py", "x", )?; let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); let py_c = prg .getattr("matmul")? .call1((py_a, py_b))? 
.extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); Ok(Tensor::from(py_c)) }); + println!("Ground: {:?}", ground); let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; @@ -471,5 +476,4 @@ mod tests { println!("Ground: {:?}", ground); Ok(()) } - */ } From 2d7d467a717b4966c92737213ded6db056b5a5ea Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:29:25 +0000 Subject: [PATCH 24/37] chore: try --- crates/ratchet-core/src/tensor.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 66001718..8113d46a 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -414,7 +414,6 @@ mod tests { use super::*; - /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -432,7 +431,6 @@ mod tests { } Ok(()) } - */ #[test] fn test_pyo3() -> anyhow::Result<()> { From d0ebe04ba671c88fdfc41ff4661067a0a5c90ff7 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:36:09 +0000 Subject: [PATCH 25/37] chore: try --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/storage/cpu_buffer.rs | 5 ----- crates/ratchet-core/src/storage/gpu_buffer.rs | 1 + crates/ratchet-core/src/storage/mod.rs | 7 ------- crates/ratchet-core/src/tensor.rs | 15 --------------- 5 files changed, 2 insertions(+), 28 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b7356739..cf4b5480 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test tensor -- --nocapture + run: cargo test tensor -- --test-threads=1 --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index f1e409fd..11323795 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -91,11 +91,6 @@ impl CPUBuffer { raw.as_bytes_mut().copy_from_slice(bytes); Self::from(raw) } - - pub fn deep_clone(&self) -> Self { - let raw_clone = (*self.inner()).clone(); - Self::from(raw_clone) - } } impl From for CPUBuffer { diff --git a/crates/ratchet-core/src/storage/gpu_buffer.rs b/crates/ratchet-core/src/storage/gpu_buffer.rs index 1631d3c7..3592e004 100644 --- a/crates/ratchet-core/src/storage/gpu_buffer.rs +++ b/crates/ratchet-core/src/storage/gpu_buffer.rs @@ -64,6 +64,7 @@ impl GPUBuffer { self.inner.usage() } + #[allow(unused)] pub fn deep_clone(&self, device: &WgpuDevice) -> Self { let clone = device .get_or_create_buffer(&BufferDescriptor::new( diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index f6bcc0d3..fe051014 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -43,13 +43,6 @@ impl Storage { _ => unimplemented!(), } } - - pub fn deep_clone(&self, device: &Device) -> Result { - match self { - Storage::CPU(c) => Ok(Storage::CPU(c.deep_clone())), - _ => unimplemented!(), - } - } } pub trait DeviceStorage: std::fmt::Debug + Clone + 'static { diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 8113d46a..ee595e87 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ 
-354,21 +354,6 @@ impl Tensor { use numpy::PyArray; PyArray::from_owned_array(*py, self.clone().into_ndarray::()) } - - pub fn deep_clone(&self) -> Tensor { - let storage_clone = self - .storage() - .as_ref() - .unwrap() - .deep_clone(self.device()) - .unwrap(); - Tensor::new( - self.op().clone(), - self.view.clone(), - Some(storage_clone), - self.device.clone(), - ) - } } #[cfg(feature = "pyo3")] From bf5af2bfebd185078c372038a6e3415f22b3fbd3 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 13:44:40 +0000 Subject: [PATCH 26/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 11323795..dac52b4a 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -61,6 +61,7 @@ impl Clone for RawCPUBuffer { impl Drop for RawCPUBuffer { fn drop(&mut self) { if !self.0.is_null() && self.1.size() > 0 { + println!("DROPPING: {:p}", self.0); unsafe { std::alloc::dealloc(self.0, self.1) } } } From 71a43ac10e16d5d6682bedfacd397d26f3d642df Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 15:43:00 +0000 Subject: [PATCH 27/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 55 ++++++++++++++++--------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index ee595e87..c553dc53 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -423,32 +423,34 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - println!("Ground: {:?}", ground); + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 
+ .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + println!("Ground: {:?}", ground); + */ let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; @@ -456,7 +458,6 @@ def matmul(a, b): c_gpu.resolve()?; let d_gpu = c_gpu.to(Device::CPU)?; println!("Ours: {:?}", d_gpu); - println!("Ground: {:?}", ground); Ok(()) } } From 8f0dd2b65dec5e2447af195aa7739a94622b8397 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 15:57:55 +0000 Subject: [PATCH 28/37] chore: no idea --- .github/workflows/rust.yml | 2 +- crates/ratchet-core/src/storage/cpu_buffer.rs | 6 +- crates/ratchet-core/src/storage/mod.rs | 7 ++ crates/ratchet-core/src/tensor.rs | 68 +++++++++++-------- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index cf4b5480..b7356739 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: cache: 'pip' - run: pip install -r requirements.txt - name: Run tests - run: cargo test tensor -- --test-threads=1 --nocapture + run: cargo test tensor -- --nocapture - name: Install wasm-pack run: | curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index dac52b4a..9c870f7d 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -2,7 +2,7 @@ use bytemuck::NoUninit; use crate::{storage::DeviceStorage, Device, DeviceError, GPUBuffer, Shape, TensorDType}; -use std::{alloc::Layout, fmt::Debug, sync::Arc}; +use std::{alloc::Layout, fmt::Debug}; use crate::DType; @@ -92,6 +92,10 @@ impl CPUBuffer { raw.as_bytes_mut().copy_from_slice(bytes); Self::from(raw) } + + pub fn deep_clone(&self) -> Result { + Ok(Self::from(self.inner().clone())) + } } impl From for CPUBuffer { diff --git a/crates/ratchet-core/src/storage/mod.rs b/crates/ratchet-core/src/storage/mod.rs index fe051014..66652e85 100644 --- a/crates/ratchet-core/src/storage/mod.rs +++ b/crates/ratchet-core/src/storage/mod.rs @@ -43,6 +43,13 @@ impl Storage { _ => unimplemented!(), } } + + pub fn deep_clone(&self) -> Result { + match self { + Storage::CPU(c) => Ok(Storage::CPU(c.deep_clone()?)), + _ => todo!(), + } + } } pub trait DeviceStorage: std::fmt::Debug + Clone + 'static { diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index c553dc53..8b7382b1 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -352,7 +352,23 @@ impl Tensor { py: &'p pyo3::Python<'p>, ) -> &PyArrayDyn { use numpy::PyArray; - PyArray::from_owned_array(*py, self.clone().into_ndarray::()) + assert!( + self.device().is_cpu(), + "Cannot convert non-CPU tensor to numpy array" + ); + PyArray::from_owned_array(*py, self.deep_clone().into_ndarray::()) + } + + pub fn deep_clone(&self) -> Tensor { + let storage_guard = self.storage(); + let storage = storage_guard.as_ref().unwrap(); + let cloned_storage = storage.deep_clone().unwrap(); + Tensor::new( + LazyOp::Const, + self.view.clone(), + Some(cloned_storage), + self.device.clone(), + ) } } @@ -423,34 +439,32 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - 
import torch + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" +import torch - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() +def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - println!("Ground: {:?}", ground); - */ + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? + .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + println!("Ground: {:?}", ground); let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; From d63d2b6b6b10b99ffabecb710446380e5d01f8c9 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:06:58 +0000 Subject: [PATCH 29/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 6 +- crates/ratchet-core/src/tensor.rs | 75 +++++++++---------- 2 files changed, 39 insertions(+), 42 deletions(-) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 9c870f7d..e6c26465 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -10,10 +10,6 @@ use crate::DType; pub struct RawCPUBuffer(*mut u8, Layout); impl RawCPUBuffer { - pub fn from_raw_parts(ptr: *mut u8, layout: Layout) -> Self { - Self(ptr, layout) - } - pub fn into_raw_parts(&self) -> (*mut u8, Layout) { (self.0, self.1) } @@ -39,6 +35,7 @@ impl RawCPUBuffer { assert!(!ptr.is_null()); ptr } as *mut u8; + println!("Unintialized: {:p}", data); Self(data, layout) } } @@ -53,6 +50,7 @@ impl Clone for RawCPUBuffer { assert!(!ptr.is_null()); ptr } as *mut u8; + println!("Cloning: {:p} -> {:p}", ptr, data); unsafe { ptr.copy_to_nonoverlapping(data, layout.size()) }; Self(data, layout) } diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 8b7382b1..f62235c0 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -386,7 +386,7 @@ impl From> for Tensor { let vec = it.into_raw_vec().into_boxed_slice(); let ptr = Box::into_raw(vec) as *mut u8; - let raw_buf = RawCPUBuffer::from_raw_parts(ptr, layout); + let raw_buf = RawCPUBuffer::new(ptr, layout); let meta = StorageView::new(shape, T::dt(), strides); Tensor::new( LazyOp::Const, @@ -419,17 +419,15 @@ mod tests { fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; let gpu_device = Device::request_device(DeviceRequest::GPU)?; - for _ in 0..10 { - let a = Tensor::randn::(shape![128, 128], cpu_device.clone()); - let b = Tensor::randn::(shape![128, 128], cpu_device.clone()); - - let a_gpu = a.to(gpu_device.clone())?; - let b_gpu = b.to(gpu_device.clone())?; - let c_gpu = a_gpu.matmul(&b_gpu)?; - c_gpu.resolve()?; - let d = c_gpu.to(Device::CPU)?; - println!("{:?}", d); - } + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let a_gpu = 
a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d = c_gpu.to(Device::CPU)?; + println!("{:?}", d); Ok(()) } @@ -439,32 +437,33 @@ mod tests { let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" -import torch - -def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - println!("Ground: {:?}", ground); + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? + .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + */ let device = Device::request_device(DeviceRequest::GPU)?; let a_gpu = a.to(device.clone())?; let b_gpu = b.to(device.clone())?; From 361091853ccfa27b8b9e5d18ea2b4b9693334540 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:08:52 +0000 Subject: [PATCH 30/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index e6c26465..3b170e6f 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -105,9 +105,8 @@ impl From for CPUBuffer { impl DeviceStorage for CPUBuffer { fn to_device(&self, device: &Device) -> Result { let gpu_device = device.try_gpu()?; - let raw = self.inner(); - let (ptr, layout) = raw.into_raw_parts(); - let bytes = unsafe { std::slice::from_raw_parts(ptr, layout.size()) }; + let bytes = self.inner().as_bytes(); + let layout = self.inner().1; Ok(GPUBuffer::from_bytes(bytes, layout.align(), gpu_device)) } From 4bdcadaf5aa885e3c6a31d12bc4874e1060739a9 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:14:41 +0000 Subject: [PATCH 31/37] chore: no idea --- crates/ratchet-core/src/storage/cpu_buffer.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/ratchet-core/src/storage/cpu_buffer.rs b/crates/ratchet-core/src/storage/cpu_buffer.rs index 3b170e6f..5281bd99 100644 --- a/crates/ratchet-core/src/storage/cpu_buffer.rs +++ b/crates/ratchet-core/src/storage/cpu_buffer.rs @@ -19,6 +19,7 @@ impl RawCPUBuffer { } pub fn as_bytes(&self) -> &[u8] { + println!("Reading: {:p}", self.0); unsafe { std::slice::from_raw_parts(self.0, self.1.size()) } } From d19f5a45a13b0631dcc5e09523df296b79999b06 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:16:56 +0000 Subject: [PATCH 32/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 90 ++++++++++++++++--------------- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs 
b/crates/ratchet-core/src/tensor.rs index f62235c0..0f6aaa4b 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -428,49 +428,55 @@ mod tests { c_gpu.resolve()?; let d = c_gpu.to(Device::CPU)?; println!("{:?}", d); + let a_cpu = a_gpu.to(Device::CPU)?; + println!("{:?}", a_cpu); + let b_cpu = b_gpu.to(Device::CPU)?; + println!("{:?}", b_cpu); Ok(()) } - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - */ - let device = Device::request_device(DeviceRequest::GPU)?; - let a_gpu = a.to(device.clone())?; - let b_gpu = b.to(device.clone())?; - let c_gpu = a_gpu.matmul(&b_gpu)?; - c_gpu.resolve()?; - let d_gpu = c_gpu.to(Device::CPU)?; - println!("Ours: {:?}", d_gpu); - Ok(()) - } + /* + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 
+ .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + */ + let device = Device::request_device(DeviceRequest::GPU)?; + let a_gpu = a.to(device.clone())?; + let b_gpu = b.to(device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d_gpu = c_gpu.to(Device::CPU)?; + println!("Ours: {:?}", d_gpu); + Ok(()) + } + */ } From aa651c2bfe71b96908fbd3fa79763ca251ad37ca Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:21:38 +0000 Subject: [PATCH 33/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 88 +++++++++++++++---------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 0f6aaa4b..06b72fd7 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -415,6 +415,7 @@ mod tests { use super::*; + /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -434,49 +435,48 @@ mod tests { println!("{:?}", b_cpu); Ok(()) } - - /* - #[test] - fn test_pyo3() -> anyhow::Result<()> { - let cpu_device = Device::request_device(DeviceRequest::CPU)?; - let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? - .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - */ - let device = Device::request_device(DeviceRequest::GPU)?; - let a_gpu = a.to(device.clone())?; - let b_gpu = b.to(device.clone())?; - let c_gpu = a_gpu.matmul(&b_gpu)?; - c_gpu.resolve()?; - let d_gpu = c_gpu.to(Device::CPU)?; - println!("Ours: {:?}", d_gpu); - Ok(()) - } */ + + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + /* + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 
+ .extract::<&PyArrayDyn>()?; + println!("py_c: {:?}", py_c); + Ok(Tensor::from(py_c)) + }); + */ + let device = Device::request_device(DeviceRequest::GPU)?; + let a_gpu = a.to(device.clone())?; + let b_gpu = b.to(device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d_gpu = c_gpu.to(Device::CPU)?; + println!("Ours: {:?}", d_gpu); + Ok(()) + } } From 4a1e0f0124861e6ccb29a08e1a5e97ff80f02735 Mon Sep 17 00:00:00 2001 From: FL33TW00D Date: Mon, 22 Jan 2024 16:26:00 +0000 Subject: [PATCH 34/37] chore: no idea --- crates/ratchet-core/src/tensor.rs | 108 ++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 35 deletions(-) diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs index 06b72fd7..177e4ae5 100644 --- a/crates/ratchet-core/src/tensor.rs +++ b/crates/ratchet-core/src/tensor.rs @@ -415,7 +415,6 @@ mod tests { use super::*; - /* #[test] fn dbg() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; @@ -435,48 +434,87 @@ mod tests { println!("{:?}", b_cpu); Ok(()) } - */ #[test] - fn test_pyo3() -> anyhow::Result<()> { + fn dbg2() -> anyhow::Result<()> { let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); - /* - let ground: anyhow::Result = Python::with_gil(|py| { - let prg = PyModule::from_code( - py, - r#" - import torch - - def matmul(a, b): - return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() - "#, - "x.py", - "x", - )?; - - let py_a = a.to_py::(&py); - println!("py_a: {:?}", py_a); - let py_b = b.to_py::(&py); - println!("py_b: {:?}", py_b); - - let py_c = prg - .getattr("matmul")? - .call1((py_a, py_b))? 
- .extract::<&PyArrayDyn>()?; - println!("py_c: {:?}", py_c); - Ok(Tensor::from(py_c)) - }); - */ - let device = Device::request_device(DeviceRequest::GPU)?; - let a_gpu = a.to(device.clone())?; - let b_gpu = b.to(device.clone())?; + let a_gpu = a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; let c_gpu = a_gpu.matmul(&b_gpu)?; c_gpu.resolve()?; - let d_gpu = c_gpu.to(Device::CPU)?; - println!("Ours: {:?}", d_gpu); + let d = c_gpu.to(Device::CPU)?; + println!("{:?}", d); + let a_cpu = a_gpu.to(Device::CPU)?; + println!("{:?}", a_cpu); + let b_cpu = b_gpu.to(Device::CPU)?; + println!("{:?}", b_cpu); Ok(()) } + + #[test] + fn dbg3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let gpu_device = Device::request_device(DeviceRequest::GPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let a_gpu = a.to(gpu_device.clone())?; + let b_gpu = b.to(gpu_device.clone())?; + let c_gpu = a_gpu.matmul(&b_gpu)?; + c_gpu.resolve()?; + let d = c_gpu.to(Device::CPU)?; + println!("{:?}", d); + let a_cpu = a_gpu.to(Device::CPU)?; + println!("{:?}", a_cpu); + let b_cpu = b_gpu.to(Device::CPU)?; + println!("{:?}", b_cpu); + Ok(()) + } + + /* + #[test] + fn test_pyo3() -> anyhow::Result<()> { + let cpu_device = Device::request_device(DeviceRequest::CPU)?; + let a = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + let b = Tensor::randn::(shape![1024, 1024], cpu_device.clone()); + + let ground: anyhow::Result = Python::with_gil(|py| { + let prg = PyModule::from_code( + py, + r#" + import torch + + def matmul(a, b): + return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy() + "#, + "x.py", + "x", + )?; + + let py_a = a.to_py::(&py); + println!("py_a: {:?}", py_a); + let py_b = b.to_py::(&py); + println!("py_b: {:?}", py_b); + + let py_c = prg + .getattr("matmul")? + .call1((py_a, py_b))? 

From e60da0137bef333a3aafac366c8e38c75cc2c3d3 Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 16:30:31 +0000
Subject: [PATCH 35/37] chore: no idea

---
 crates/ratchet-core/src/tensor.rs | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 177e4ae5..7901dd52 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -415,6 +415,7 @@ mod tests {

     use super::*;

+    /*
     #[test]
     fn dbg() -> anyhow::Result<()> {
         let cpu_device = Device::request_device(DeviceRequest::CPU)?;
@@ -474,6 +475,42 @@ mod tests {
         println!("{:?}", b_cpu);
         Ok(())
     }
+    */
+
+    #[test]
+    fn dbg4() -> anyhow::Result<()> {
+        let cpu_device = Device::request_device(DeviceRequest::CPU)?;
+        let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+        let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
+
+        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
+            let prg = PyModule::from_code(
+                py,
+                r#"
+            import torch
+
+            def matmul(a, b):
+                return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+            "#,
+                "x.py",
+                "x",
+            )?;
+
+            let py_a = a.to_py::<f32>(&py);
+            println!("py_a: {:?}", py_a);
+            let py_b = b.to_py::<f32>(&py);
+            println!("py_b: {:?}", py_b);
+
+            let py_c = prg
+                .getattr("matmul")?
+                .call1((py_a, py_b))?
+                .extract::<&PyArrayDyn<f32>>()?;
+            println!("py_c: {:?}", py_c);
+            Ok(Tensor::from(py_c))
+        });
+        println!("ground: {:?}", ground);
+        Ok(())
+    }

     /*
     #[test]
     fn test_pyo3() -> anyhow::Result<()> {

From f84feaf47d579a296fc1ec2e6196cbfe407dcafe Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 16:33:43 +0000
Subject: [PATCH 36/37] chore: dear god

---
 crates/ratchet-core/src/tensor.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 7901dd52..1e206876 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -487,10 +487,10 @@ mod tests {
             let prg = PyModule::from_code(
                 py,
                 r#"
-            import torch
+import torch

-            def matmul(a, b):
-                return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
+def matmul(a, b):
+    return torch.matmul(torch.from_numpy(a), torch.from_numpy(b)).numpy()
 "#,
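Patch 36 is the substantive fix in this run. `PyModule::from_code` hands the source string to the Python interpreter verbatim, and Python is whitespace-sensitive, so indenting the embedded module to match the surrounding Rust makes it fail with an IndentationError before `matmul` is ever defined; the module body has to start at column 0 inside the raw string. A self-contained demonstration of the failure mode (assuming pyo3 0.20 with `auto-initialize`; not code from the series):

use pyo3::{types::PyModule, Python};

fn main() {
    Python::with_gil(|py| {
        // Leading whitespace is significant: an indented module body is rejected.
        assert!(PyModule::from_code(py, "    import math", "x.py", "x").is_err());
        // The same source at column 0 loads fine.
        assert!(PyModule::from_code(py, "import math", "x.py", "x").is_ok());
    });
}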

From 53857f47f1f52b3f15d25a424a00c6bf9a3bb45f Mon Sep 17 00:00:00 2001
From: FL33TW00D
Date: Mon, 22 Jan 2024 16:38:40 +0000
Subject: [PATCH 37/37] chore: dear god

---
 crates/ratchet-core/src/tensor.rs | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/crates/ratchet-core/src/tensor.rs b/crates/ratchet-core/src/tensor.rs
index 1e206876..d392db8e 100644
--- a/crates/ratchet-core/src/tensor.rs
+++ b/crates/ratchet-core/src/tensor.rs
@@ -483,7 +483,7 @@ mod tests {
         let a = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());
         let b = Tensor::randn::<f32>(shape![1024, 1024], cpu_device.clone());

-        let ground: anyhow::Result<Tensor> = Python::with_gil(|py| {
+        Python::with_gil(|py| {
             let prg = PyModule::from_code(
@@ -494,21 +494,15 @@ mod tests {
                 "x.py",
                 "x",
-            )?;
+            )
+            .unwrap();

             let py_a = a.to_py::<f32>(&py);
             println!("py_a: {:?}", py_a);
-            let py_b = b.to_py::<f32>(&py);
-            println!("py_b: {:?}", py_b);
-
-            let py_c = prg
-                .getattr("matmul")?
-                .call1((py_a, py_b))?
-                .extract::<&PyArrayDyn<f32>>()?;
-            println!("py_c: {:?}", py_c);
-            Ok(Tensor::from(py_c))
         });
-        println!("ground: {:?}", ground);
+
+        let device = Device::request_device(DeviceRequest::GPU)?;
+        let a_gpu = a.to(device.clone())?;
         Ok(())
     }
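The series ends mid-debug: after patch 37, `dbg4` only converts `a` to a NumPy array under the GIL and then moves it to the GPU, with the torch comparison stripped back out. Once the GPU product and the PyTorch ground truth both sit on the CPU again, the missing final step is an elementwise tolerance check along these lines; `all_close` is a hypothetical helper built on the series' `into_ndarray`, not an API from these patches:

fn all_close(ours: &Tensor, ground: &Tensor, atol: f32) -> bool {
    // Both tensors must already live on the CPU for into_ndarray to work.
    let (a, b) = (ours.into_ndarray::<f32>(), ground.into_ndarray::<f32>());
    a.shape() == b.shape() && a.iter().zip(b.iter()).all(|(x, y)| (x - y).abs() <= atol)
}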