From 500eb1ac89e9c5a1251b794616cf41f74e5f471f Mon Sep 17 00:00:00 2001
From: Ivar Flakstad <69173633+ivarflakstad@users.noreply.github.com>
Date: Wed, 28 Aug 2024 14:13:38 +0200
Subject: [PATCH] stash for debugging

---
 crates/ratchet-core/src/dtype/blocks.rs |  28 ++++-
 crates/ratchet-core/src/quant.rs        | 143 +++++++++++++++++++-----
 2 files changed, 143 insertions(+), 28 deletions(-)

diff --git a/crates/ratchet-core/src/dtype/blocks.rs b/crates/ratchet-core/src/dtype/blocks.rs
index b3c99ff4..157f22f4 100644
--- a/crates/ratchet-core/src/dtype/blocks.rs
+++ b/crates/ratchet-core/src/dtype/blocks.rs
@@ -3,7 +3,7 @@
 ///
 /// We closely follow the memory layout of the original GGUF implementation,
 /// but often need 2 variants of each block type for devices that don't support f16.
-use crate::{rvec, Align, BufferSegment, RVec, TensorDType};
+use crate::{rvec, Align, BufferSegment, DType, RVec, TensorDType};
 use derive_new::new;
 use half::f16;
 use num_traits::{AsPrimitive, Float, FromPrimitive};
@@ -169,24 +169,46 @@ pub trait Quantized {
     type FP: TensorDType + Float + AsPrimitive<i32> + FromPrimitive + Copy + PartialEq;
     const PACK_SIZE: usize;
     const GROUP_SIZE: usize;
+
+    const LSHIFT: usize = Self::GROUP_SIZE / Self::PACK_SIZE;
+    const MASK: i32 = (1 << Self::LSHIFT) - 1;
+    const RSHIFT: usize = Self::GROUP_SIZE - Self::LSHIFT;
+
+    fn dt() -> DType;
 }
 impl Quantized for Q8_0F {
     type FP = f32;
     const PACK_SIZE: usize = 4;
     const GROUP_SIZE: usize = 32;
+
+    fn dt() -> DType {
+        DType::Q8_0F(Q8_0F::default())
+    }
 }
 impl Quantized for Q8_0H {
     type FP = f16;
     const PACK_SIZE: usize = 4;
     const GROUP_SIZE: usize = 32;
+
+    fn dt() -> DType {
+        DType::Q8_0H(Q8_0H::default())
+    }
 }
 impl Quantized for Q4_KF {
     type FP = f32;
     const PACK_SIZE: usize = 8;
-    const GROUP_SIZE: usize = 8;
+    const GROUP_SIZE: usize = 32;
+
+    fn dt() -> DType {
+        DType::Q4_KF(Q4_KF::default())
+    }
 }
 impl Quantized for Q4_KH {
     type FP = f16;
     const PACK_SIZE: usize = 8;
-    const GROUP_SIZE: usize = 8;
+    const GROUP_SIZE: usize = 32;
+
+    fn dt() -> DType {
+        DType::Q4_KH(Q4_KH::default())
+    }
 }
diff --git a/crates/ratchet-core/src/quant.rs b/crates/ratchet-core/src/quant.rs
index 67dda2ef..bba0e82f 100644
--- a/crates/ratchet-core/src/quant.rs
+++ b/crates/ratchet-core/src/quant.rs
@@ -4,7 +4,7 @@ use num_traits::{AsPrimitive, Float, FromPrimitive, Zero};
 use std::fmt::Debug;
 
 use crate::{
-    dtype::Quantized, gpu::STORAGE_BUFFER_ALIGN, DType, Device, Tensor, TensorDType, Q8_0F,
+    dtype::Quantized, gpu::STORAGE_BUFFER_ALIGN, DType, Device, Tensor, Q4_KF, Q4_KH, Q8_0F,
     Q8_0H,
 };
 /// Quantizer
@@ -15,28 +15,30 @@ pub struct Quantizer {
     format: Quantization,
 }
 
-fn quantize_inner<Q: Quantized>(matrix: &[Q::FP], elements: usize) -> Vec<u32> {
+#[inline]
+fn storage_align<T>(n: usize) -> usize {
+    let size_t = core::mem::size_of::<T>();
+    let nbytes = n * size_t;
+    let aligned = if nbytes % STORAGE_BUFFER_ALIGN != 0 {
+        nbytes + STORAGE_BUFFER_ALIGN - nbytes % STORAGE_BUFFER_ALIGN
+    } else {
+        nbytes
+    };
+    aligned / size_t
+}
+
+pub fn quantize_inner<Q: Quantized>(matrix: &[Q::FP], elements: usize) -> Vec<u32> {
+    println!("quantize_inner");
     assert_eq!(elements % Q::PACK_SIZE, 0);
     assert_eq!(elements % Q::GROUP_SIZE, 0);
 
     let qmatrix_len = elements / Q::PACK_SIZE;
     let amatrix_len = elements / Q::GROUP_SIZE;
 
-    //returns the aligned number of ELEMENTS
-    let aligner = |numel: usize, size_t: usize| -> usize {
-        let nbytes = numel * size_t;
-        let aligned = if nbytes % STORAGE_BUFFER_ALIGN != 0 {
-            nbytes + STORAGE_BUFFER_ALIGN - nbytes % STORAGE_BUFFER_ALIGN
-        } else {
-            nbytes
-        };
-        aligned / size_t
-    };
-
-    let mut quantized_matrix = vec![0u32; aligner(qmatrix_len, std::mem::size_of::<u32>())];
-    let mut absmax_matrix = vec![Q::FP::zero(); aligner(amatrix_len, std::mem::size_of::<Q::FP>())];
-
+    let mut quantized_matrix = vec![0u32; storage_align::<u32>(qmatrix_len)];
+    let mut absmax_matrix = vec![Q::FP::zero(); storage_align::<Q::FP>(amatrix_len)];
     let mut block_absmax = Q::FP::neg_infinity();
+
     for i in (0..elements).step_by(Q::PACK_SIZE) {
         if i % Q::GROUP_SIZE == 0 {
             let amax = matrix[i..i + Q::GROUP_SIZE]
@@ -47,7 +49,7 @@
         }
         for j in 0..Q::PACK_SIZE {
             let packed_value: i32 =
-                ((matrix[i + j] / block_absmax).round().as_() & 0xFF) << (j * 8);
+                ((matrix[i + j] / block_absmax).round().as_() & Q::MASK) << (j * Q::LSHIFT);
             quantized_matrix[i / Q::PACK_SIZE] |= packed_value as u32;
         }
         absmax_matrix[i / Q::GROUP_SIZE] = block_absmax;
@@ -58,8 +60,8 @@
 }
 
 pub fn quantize<Q: Quantized>(tensor: &Tensor) -> Tensor {
-    return match tensor.dt() {
-        DType::F32 => {
+    match (tensor.dt(), Q::dt()) {
+        (DType::F32, DType::Q8_0F(_)) => {
             let matrix = tensor.to_vec::<f32>().unwrap();
             unsafe {
                 Tensor::from_quantized(
@@ -70,11 +72,45 @@
                 )
             }
         }
-        dt => panic!("Unsupported dtype {dt}"),
-    };
+        (DType::F32, DType::Q4_KF(_)) => {
+            let matrix = tensor.to_vec::<f32>().unwrap();
+            unsafe {
+                Tensor::from_quantized(
+                    quantize_inner::<Q4_KF>(&matrix, tensor.shape().numel()),
+                    DType::Q4_KF(Q4_KF::default()),
+                    tensor.shape().clone(),
+                    Device::CPU,
+                )
+            }
+        }
+        (DType::F16, DType::Q8_0H(_)) => {
+            let matrix = tensor.to_vec::<f16>().unwrap();
+            unsafe {
+                Tensor::from_quantized(
+                    quantize_inner::<Q8_0H>(&matrix, tensor.shape().numel()),
+                    DType::Q8_0H(Q8_0H::default()),
+                    tensor.shape().clone(),
+                    Device::CPU,
+                )
+            }
+        }
+        (DType::F16, DType::Q4_KH(_)) => {
+            let matrix = tensor.to_vec::<f16>().unwrap();
+            unsafe {
+                Tensor::from_quantized(
+                    quantize_inner::<Q4_KH>(&matrix, tensor.shape().numel()),
+                    DType::Q4_KH(Q4_KH::default()),
+                    tensor.shape().clone(),
+                    Device::CPU,
+                )
+            }
+        }
+        (dt, qdt) => panic!("Unsupported dtype combination {dt}, {qdt}"),
+    }
 }
 
 fn dequantize_inner<Q: Quantized>(quantized: &[u8], numel: usize) -> Vec<Q::FP> {
+    println!("dequantize_inner");
     let num_q = numel / Q::PACK_SIZE;
     let num_q_bytes = num_q * std::mem::size_of::<u32>();
     let aligner = |numel: usize, size_t: usize| -> usize {
@@ -101,9 +137,11 @@
         let block_absmax = absmax_matrix[div_floor(i, Q::GROUP_SIZE)];
         let packed_value = quantized_matrix[div_floor(i, Q::PACK_SIZE)] as i32;
         for j in 0..Q::PACK_SIZE {
-            dequantized[i + j] =
-                Q::FP::from_i32((packed_value << (8 * (Q::PACK_SIZE - j - 1))) >> 24).unwrap()
-                    * block_absmax;
+            dequantized[i + j] = Q::FP::from_i32(
+                (packed_value << (Q::LSHIFT * (Q::PACK_SIZE - j - 1))) >> Q::RSHIFT,
+            )
+            .unwrap()
+                * block_absmax;
         }
     }
 
@@ -119,6 +157,27 @@ pub fn dequantize(quantized: Tensor) -> Tensor {
             let dequantized = dequantize_inner::<Q8_0F>(&raw_bytes, elements);
             Tensor::from_data(&dequantized, original_shape, Device::CPU)
         }
+        DType::Q4_KF(_) => {
+            let elements = quantized.shape().numel();
+            let original_shape = quantized.shape().clone();
+            let raw_bytes = unsafe { quantized.into_bytes().unwrap() };
+            let dequantized = dequantize_inner::<Q4_KF>(&raw_bytes, elements);
+            Tensor::from_data(&dequantized, original_shape, Device::CPU)
+        }
+        DType::Q8_0H(_) => {
+            let elements = quantized.shape().numel();
+            let original_shape = quantized.shape().clone();
+            let raw_bytes = unsafe { quantized.into_bytes().unwrap() };
+            let dequantized = dequantize_inner::<Q8_0H>(&raw_bytes, elements);
+            Tensor::from_data(&dequantized, original_shape, Device::CPU)
+        }
+        DType::Q4_KH(_) => {
+            let elements = quantized.shape().numel();
+            let original_shape = quantized.shape().clone();
+            let raw_bytes = unsafe { quantized.into_bytes().unwrap() };
+            let dequantized = dequantize_inner::<Q4_KH>(&raw_bytes, elements);
+            Tensor::from_data(&dequantized, original_shape, Device::CPU)
+        }
         dt => panic!("Unsupported dtype {dt}"),
     };
 }
@@ -299,7 +358,10 @@ impl Quantization {
 
 #[cfg(test)]
 mod tests {
-    use crate::{dequantize, quantize, shape, Device, Quantization, Quantizer, Tensor, Q8_0F};
+    use crate::{
+        dequantize, quantize, quantize_inner, shape, Device, Quantization, Quantizer, Tensor,
+        Q4_KF, Q8_0F,
+    };
 
     #[test]
     pub fn test_sint8_qdq() {
@@ -317,7 +379,38 @@
         let q1_raw = unsafe { q1.deep_clone().into_bytes().unwrap() };
         let q2_raw = unsafe { q2.deep_clone().into_bytes().unwrap() };
         assert_eq!(q1_raw, q2_raw);
+        if q1_raw == q2_raw {
+            println!("SInt8 quantization is correct");
+        }
 
         dq1.all_close(&dq2, 1e-3, 1e-3).unwrap();
     }
+
+    #[test]
+    pub fn test_sint4_qdq() {
+        let ground = Tensor::randn::<f32>(shape![64, 64], Device::CPU);
+
+        // Old api
+        let data = ground.to_vec::<f32>().unwrap();
+        let (q1, absmax) = Quantizer::sint4_quantize::<f32>(&data, 64, 64);
+        let dq1 = Quantizer::sint4_dequantize(&q1, absmax, 64, 64);
+
+        // New api
+        let q2 = quantize_inner::<Q4_KF>(&data, 64 * 64);
+        //let dq2 = dequantize(q2.deep_clone());
+
+        for (a, b) in q1.iter().zip(q2.iter()) {
+            if a != b {
+                println!("{} {}", a, b);
+            }
+        }
+        /*
+        let dq2_vec = dq2.to_vec::<f32>().unwrap();
+        for (a, b) in dq1.iter().zip(dq2_vec.iter()) {
+            if (a - b).abs() >= 1e-3 {
+                println!("{} {}", a, b);
+            }
+        }
+        */
+    }
 }
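
The derived constants in the Quantized trait generalise the old hard-coded 8-bit packing (& 0xFF, << (j * 8), >> 24): LSHIFT = GROUP_SIZE / PACK_SIZE is the per-value bit width (which only holds because every impl now uses GROUP_SIZE = 32, hence the Q4_K change from 8 to 32), MASK keeps that many low bits of the rounded value, and RSHIFT is the shift that sign-extends a value back out of a packed u32. Below is a minimal, self-contained sketch of that round trip for the Q4 layout (8 values per u32, one scale per 32-element group). The names pack/unpack and the explicit amax / 7 scale are assumptions made for the example; the patch's own scale computation falls outside the hunks shown above.

// Standalone sketch of the packing scheme; not part of the patch.
const PACK_SIZE: usize = 8; // 8 values per u32 => 4 bits each (Q4-style)
const GROUP_SIZE: usize = 32; // one scale per 32 elements
const LSHIFT: usize = GROUP_SIZE / PACK_SIZE; // per-value bit width; equals 4 only because GROUP_SIZE == 32
const MASK: i32 = (1 << LSHIFT) - 1; // 0xF
const RSHIFT: usize = GROUP_SIZE - LSHIFT; // 28, used to sign-extend on unpack

fn pack(matrix: &[f32]) -> (Vec<u32>, Vec<f32>) {
    assert_eq!(matrix.len() % GROUP_SIZE, 0);
    let mut packed = vec![0u32; matrix.len() / PACK_SIZE];
    let mut scales = vec![0f32; matrix.len() / GROUP_SIZE];
    let mut scale = 0f32;
    for i in (0..matrix.len()).step_by(PACK_SIZE) {
        if i % GROUP_SIZE == 0 {
            let amax = matrix[i..i + GROUP_SIZE]
                .iter()
                .fold(0f32, |acc, &x| acc.max(x.abs()));
            // Symmetric scale so the rounded values fit in [-7, 7] (an assumption;
            // the patch's scale derivation is not visible in the diff).
            scale = amax / ((1 << (LSHIFT - 1)) - 1) as f32;
            scales[i / GROUP_SIZE] = scale;
        }
        for j in 0..PACK_SIZE {
            // Keep the low LSHIFT bits of the rounded value and place it in lane j.
            let q = ((matrix[i + j] / scale).round() as i32 & MASK) << (j * LSHIFT);
            packed[i / PACK_SIZE] |= q as u32;
        }
    }
    (packed, scales)
}

fn unpack(packed: &[u32], scales: &[f32], numel: usize) -> Vec<f32> {
    let mut out = vec![0f32; numel];
    for i in (0..numel).step_by(PACK_SIZE) {
        let scale = scales[i / GROUP_SIZE];
        let word = packed[i / PACK_SIZE] as i32;
        for j in 0..PACK_SIZE {
            // Move lane j to the top of the i32, then arithmetic-shift it back
            // down so the LSHIFT-bit value comes out sign-extended.
            let v = (word << (LSHIFT * (PACK_SIZE - j - 1))) >> RSHIFT;
            out[i + j] = v as f32 * scale;
        }
    }
    out
}

fn main() {
    let data: Vec<f32> = (0..64).map(|x| (x as f32 - 32.0) / 32.0).collect();
    let (packed, scales) = pack(&data);
    let roundtrip = unpack(&packed, &scales, data.len());
    for (a, b) in data.iter().zip(roundtrip.iter()) {
        // 4-bit reconstruction is coarse; the error is bounded by scale / 2.
        assert!((a - b).abs() <= 0.08, "{a} vs {b}");
    }
    println!("round-trip ok");
}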
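
The new match arms in quantize and dequantize would be driven the same way the Q8_0F test drives them; since the patch still leaves the Q4 dequantize comparison commented out, the test below is only a hypothetical end-to-end check. It assumes the ratchet-core items already used in the tests above (Tensor::randn, shape!, all_close) keep the signatures shown there, and the 1e-1 tolerance is an arbitrary allowance for 4-bit precision.

use crate::{dequantize, quantize, shape, Device, Tensor, Q4_KF};

#[test]
pub fn test_q4_roundtrip() {
    let ground = Tensor::randn::<f32>(shape![64, 64], Device::CPU);
    let q = quantize::<Q4_KF>(&ground); // hits the (F32, Q4_KF) arm
    let dq = dequantize(q); // hits the DType::Q4_KF arm
    // Tolerance is a guess; 4-bit reconstruction is far coarser than Q8_0F's 1e-3.
    ground.all_close(&dq, 1e-1, 1e-1).unwrap();
}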