use std::fmt;
use crate::grad_checkpoint::selective::SelectiveCheckpointStrategy;
#[derive(Debug, Clone, PartialEq)]
pub enum MemOptError {
InvalidCompressedLength { expected_min: usize, got: usize },
UnsupportedBitWidth(u8),
LengthMismatch { expected: usize, got: usize },
InvalidConfig(String),
}
impl fmt::Display for MemOptError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
MemOptError::InvalidCompressedLength { expected_min, got } => {
write!(
f,
"compressed buffer too short: expected at least {expected_min} bytes, got {got}"
)
},
MemOptError::UnsupportedBitWidth(bits) => {
write!(f, "unsupported activation quantisation bit width: {bits}")
},
MemOptError::LengthMismatch { expected, got } => {
write!(
f,
"decompressed length mismatch: expected {expected} elements, got {got}"
)
},
MemOptError::InvalidConfig(msg) => write!(f, "invalid memory optimiser config: {msg}"),
}
}
}
impl std::error::Error for MemOptError {}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum StatePrecision {
Fp32,
Fp16,
Bf16,
Int8Quantized,
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ActivationCompression {
None,
Fp16Cast,
Quantized {
bits: u8,
},
Sparse {
threshold: f32,
},
}
#[derive(Debug, Clone)]
pub struct MemoryOptimizationConfig {
pub offload_activations: bool,
pub offload_optimizer_states: bool,
pub use_memory_efficient_attention: bool,
pub gradient_checkpointing: SelectiveCheckpointStrategy,
pub optimizer_state_precision: StatePrecision,
pub activation_compression: ActivationCompression,
pub max_gpu_memory_mb: Option<f32>,
}
impl Default for MemoryOptimizationConfig {
fn default() -> Self {
Self {
offload_activations: false,
offload_optimizer_states: false,
use_memory_efficient_attention: true,
gradient_checkpointing: SelectiveCheckpointStrategy::None,
optimizer_state_precision: StatePrecision::Fp32,
activation_compression: ActivationCompression::None,
max_gpu_memory_mb: Option::None,
}
}
}
#[derive(Debug, Clone)]
pub struct MemoryBudget {
pub total_gpu_memory_mb: f32,
pub model_weights_mb: f32,
pub optimizer_states_mb: f32,
pub activations_mb: f32,
pub gradients_mb: f32,
pub reserved_mb: f32,
}
impl MemoryBudget {
pub fn available_for_activations_mb(&self) -> f32 {
let used =
self.model_weights_mb + self.optimizer_states_mb + self.gradients_mb + self.reserved_mb;
(self.total_gpu_memory_mb - used).max(0.0)
}
pub fn is_feasible(&self) -> bool {
let non_negative = self.total_gpu_memory_mb >= 0.0
&& self.model_weights_mb >= 0.0
&& self.optimizer_states_mb >= 0.0
&& self.activations_mb >= 0.0
&& self.gradients_mb >= 0.0
&& self.reserved_mb >= 0.0;
if !non_negative {
return false;
}
let total_used = self.model_weights_mb
+ self.optimizer_states_mb
+ self.activations_mb
+ self.gradients_mb
+ self.reserved_mb;
total_used <= self.total_gpu_memory_mb
}
}
pub fn estimate_model_memory(num_params: u64, dtype_bytes: u32) -> f32 {
(num_params as f64 * dtype_bytes as f64 / (1024.0 * 1024.0)) as f32
}
pub fn estimate_optimizer_state_memory(num_params: u64, precision: StatePrecision) -> f32 {
let bytes_per_param: u64 = match precision {
StatePrecision::Fp32 => 8,
StatePrecision::Fp16 | StatePrecision::Bf16 => 4,
StatePrecision::Int8Quantized => 1,
};
(num_params as f64 * bytes_per_param as f64 / (1024.0 * 1024.0)) as f32
}
pub fn estimate_activation_memory(
batch_size: usize,
seq_len: usize,
hidden_size: usize,
num_layers: usize,
checkpoint_ratio: f32,
) -> f32 {
let baseline = batch_size as f64
* seq_len as f64
* hidden_size as f64
* 4.0 * num_layers as f64
* 2.0; let saved_fraction = (1.0 - checkpoint_ratio as f64).max(0.0);
(baseline * saved_fraction / (1024.0 * 1024.0)) as f32
}
pub struct MemoryOptimizer {
pub config: MemoryOptimizationConfig,
}
impl MemoryOptimizer {
pub fn new(config: MemoryOptimizationConfig) -> Self {
Self { config }
}
pub fn compute_memory_budget(
num_params: u64,
batch_size: usize,
seq_len: usize,
hidden_size: usize,
num_layers: usize,
total_gpu_mb: f32,
) -> MemoryBudget {
let model_weights_mb = estimate_model_memory(num_params, 4); let optimizer_states_mb = estimate_optimizer_state_memory(num_params, StatePrecision::Fp32);
let activations_mb =
estimate_activation_memory(batch_size, seq_len, hidden_size, num_layers, 0.0);
let gradients_mb = model_weights_mb; let reserved_mb = 512.0;
MemoryBudget {
total_gpu_memory_mb: total_gpu_mb,
model_weights_mb,
optimizer_states_mb,
activations_mb,
gradients_mb,
reserved_mb,
}
}
pub fn recommend_strategy(
budget: &MemoryBudget,
total_gpu_mb: f32,
) -> MemoryOptimizationConfig {
if budget.is_feasible() {
return MemoryOptimizationConfig::default();
}
let total_used = budget.model_weights_mb
+ budget.optimizer_states_mb
+ budget.activations_mb
+ budget.gradients_mb
+ budget.reserved_mb;
let over_ratio = if total_gpu_mb > 0.0 {
(total_used - total_gpu_mb) / total_gpu_mb
} else {
1.0
};
if over_ratio < 0.25 {
MemoryOptimizationConfig {
gradient_checkpointing: SelectiveCheckpointStrategy::EveryNthLayer { n: 2 },
..MemoryOptimizationConfig::default()
}
} else if over_ratio < 0.50 {
MemoryOptimizationConfig {
gradient_checkpointing: SelectiveCheckpointStrategy::EveryNthLayer { n: 2 },
activation_compression: ActivationCompression::Fp16Cast,
..MemoryOptimizationConfig::default()
}
} else {
MemoryOptimizationConfig {
gradient_checkpointing: SelectiveCheckpointStrategy::Full,
activation_compression: ActivationCompression::Fp16Cast,
offload_activations: true,
..MemoryOptimizationConfig::default()
}
}
}
pub fn compress_activation(activation: &[f32], method: ActivationCompression) -> Vec<u8> {
match method {
ActivationCompression::None => {
let mut out = Vec::with_capacity(activation.len() * 4);
for &v in activation {
out.extend_from_slice(&v.to_le_bytes());
}
out
},
ActivationCompression::Fp16Cast => {
let mut out = Vec::with_capacity(activation.len() * 2);
for &v in activation {
let bf16_bits = f32_to_bf16_bits(v);
out.extend_from_slice(&bf16_bits.to_le_bytes());
}
out
},
ActivationCompression::Quantized { bits } => compress_quantized(activation, bits),
ActivationCompression::Sparse { threshold } => compress_sparse(activation, threshold),
}
}
pub fn decompress_activation(
compressed: &[u8],
original_len: usize,
method: ActivationCompression,
) -> Result<Vec<f32>, MemOptError> {
match method {
ActivationCompression::None => decompress_none(compressed, original_len),
ActivationCompression::Fp16Cast => decompress_fp16cast(compressed, original_len),
ActivationCompression::Quantized { bits } => {
decompress_quantized(compressed, original_len, bits)
},
ActivationCompression::Sparse { .. } => decompress_sparse(compressed, original_len),
}
}
}
#[inline]
fn f32_to_bf16_bits(v: f32) -> u16 {
let bits = v.to_bits();
let rounding_bias = 0x7FFF_u32 + ((bits >> 16) & 1);
((bits.wrapping_add(rounding_bias)) >> 16) as u16
}
#[inline]
fn bf16_bits_to_f32(bits: u16) -> f32 {
f32::from_bits((bits as u32) << 16)
}
fn decompress_none(compressed: &[u8], original_len: usize) -> Result<Vec<f32>, MemOptError> {
let expected_bytes = original_len * 4;
if compressed.len() < expected_bytes {
return Err(MemOptError::InvalidCompressedLength {
expected_min: expected_bytes,
got: compressed.len(),
});
}
let mut out = Vec::with_capacity(original_len);
for chunk in compressed[..expected_bytes].chunks_exact(4) {
let arr: [u8; 4] = [chunk[0], chunk[1], chunk[2], chunk[3]];
out.push(f32::from_le_bytes(arr));
}
Ok(out)
}
fn decompress_fp16cast(compressed: &[u8], original_len: usize) -> Result<Vec<f32>, MemOptError> {
let expected_bytes = original_len * 2;
if compressed.len() < expected_bytes {
return Err(MemOptError::InvalidCompressedLength {
expected_min: expected_bytes,
got: compressed.len(),
});
}
let mut out = Vec::with_capacity(original_len);
for chunk in compressed[..expected_bytes].chunks_exact(2) {
let bits = u16::from_le_bytes([chunk[0], chunk[1]]);
out.push(bf16_bits_to_f32(bits));
}
Ok(out)
}
fn compress_quantized(activation: &[f32], bits: u8) -> Vec<u8> {
if activation.is_empty() || bits == 0 {
let mut out = Vec::with_capacity(12);
out.extend_from_slice(&0_f32.to_le_bytes());
out.extend_from_slice(&0_f32.to_le_bytes());
out.extend_from_slice(&(0_u32).to_le_bytes());
return out;
}
let bits = bits.min(8);
let min_val = activation.iter().cloned().fold(f32::INFINITY, f32::min);
let max_val = activation.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let range = max_val - min_val;
let levels = ((1u32 << bits) - 1) as f32;
let n = activation.len();
let packed_bytes = (n * bits as usize).div_ceil(8);
let mut out = Vec::with_capacity(12 + packed_bytes);
out.extend_from_slice(&min_val.to_le_bytes());
out.extend_from_slice(&max_val.to_le_bytes());
out.extend_from_slice(&(n as u32).to_le_bytes());
let mut bit_buf: u64 = 0;
let mut bits_in_buf: u32 = 0;
for &v in activation {
let q = if range < f32::EPSILON {
0_u64
} else {
((v - min_val) / range * levels).round() as u64
};
bit_buf |= q << bits_in_buf;
bits_in_buf += bits as u32;
while bits_in_buf >= 8 {
out.push((bit_buf & 0xFF) as u8);
bit_buf >>= 8;
bits_in_buf -= 8;
}
}
if bits_in_buf > 0 {
out.push((bit_buf & 0xFF) as u8);
}
out
}
fn decompress_quantized(
compressed: &[u8],
original_len: usize,
bits: u8,
) -> Result<Vec<f32>, MemOptError> {
if bits == 0 || bits > 8 {
return Err(MemOptError::UnsupportedBitWidth(bits));
}
if compressed.len() < 12 {
return Err(MemOptError::InvalidCompressedLength {
expected_min: 12,
got: compressed.len(),
});
}
let min_val = f32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]);
let max_val = f32::from_le_bytes([compressed[4], compressed[5], compressed[6], compressed[7]]);
let stored_n =
u32::from_le_bytes([compressed[8], compressed[9], compressed[10], compressed[11]]) as usize;
if stored_n != original_len {
return Err(MemOptError::LengthMismatch {
expected: original_len,
got: stored_n,
});
}
let levels = ((1u32 << bits) - 1) as f32;
let range = max_val - min_val;
let payload = &compressed[12..];
let mut out = Vec::with_capacity(original_len);
let mask = (1u64 << bits) - 1;
let mut bit_buf: u64 = 0;
let mut bits_in_buf: u32 = 0;
let mut byte_idx = 0;
for _ in 0..original_len {
while bits_in_buf < bits as u32 {
if byte_idx < payload.len() {
bit_buf |= (payload[byte_idx] as u64) << bits_in_buf;
byte_idx += 1;
}
bits_in_buf += 8;
}
let q = bit_buf & mask;
bit_buf >>= bits as u32;
bits_in_buf -= bits as u32;
let v = if levels < f32::EPSILON {
min_val
} else {
min_val + (q as f32 / levels) * range
};
out.push(v);
}
Ok(out)
}
fn compress_sparse(activation: &[f32], threshold: f32) -> Vec<u8> {
let n = activation.len();
let pairs: Vec<(u32, f32)> = activation
.iter()
.enumerate()
.filter_map(
|(i, &v)| {
if v.abs() > threshold {
Some((i as u32, v))
} else {
Option::None
}
},
)
.collect();
let mut out = Vec::with_capacity(4 + 8 * pairs.len());
out.extend_from_slice(&(n as u32).to_le_bytes());
for (idx, val) in pairs {
out.extend_from_slice(&idx.to_le_bytes());
out.extend_from_slice(&val.to_le_bytes());
}
out
}
fn decompress_sparse(compressed: &[u8], original_len: usize) -> Result<Vec<f32>, MemOptError> {
if compressed.len() < 4 {
return Err(MemOptError::InvalidCompressedLength {
expected_min: 4,
got: compressed.len(),
});
}
let stored_n =
u32::from_le_bytes([compressed[0], compressed[1], compressed[2], compressed[3]]) as usize;
if stored_n != original_len {
return Err(MemOptError::LengthMismatch {
expected: original_len,
got: stored_n,
});
}
let mut out = vec![0.0_f32; original_len];
let payload = &compressed[4..];
let mut offset = 0;
while offset + 8 <= payload.len() {
let idx = u32::from_le_bytes([
payload[offset],
payload[offset + 1],
payload[offset + 2],
payload[offset + 3],
]) as usize;
let val = f32::from_le_bytes([
payload[offset + 4],
payload[offset + 5],
payload[offset + 6],
payload[offset + 7],
]);
if idx < original_len {
out[idx] = val;
}
offset += 8;
}
Ok(out)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_model_memory_estimate() {
let mb = estimate_model_memory(1_000_000, 4);
let expected = 1_000_000.0 * 4.0 / (1024.0 * 1024.0);
assert!((mb - expected).abs() < 0.01, "got {mb}");
}
#[test]
fn test_optimizer_state_memory_fp16_half_fp32() {
let fp32_mb = estimate_optimizer_state_memory(1_000_000, StatePrecision::Fp32);
let fp16_mb = estimate_optimizer_state_memory(1_000_000, StatePrecision::Fp16);
assert!(
(fp32_mb / fp16_mb - 2.0).abs() < 0.01,
"fp32={fp32_mb} fp16={fp16_mb}"
);
}
#[test]
fn test_optimizer_state_memory_values() {
let n = 1_000_000_u64;
let fp32 = estimate_optimizer_state_memory(n, StatePrecision::Fp32);
let expected = 8.0 * 1_000_000.0 / (1024.0 * 1024.0);
assert!((fp32 - expected).abs() < 0.01);
let int8 = estimate_optimizer_state_memory(n, StatePrecision::Int8Quantized);
assert!(int8 < 1.1 && int8 > 0.9, "int8={int8}");
let bf16 = estimate_optimizer_state_memory(n, StatePrecision::Bf16);
let fp16 = estimate_optimizer_state_memory(n, StatePrecision::Fp16);
assert!((bf16 - fp16).abs() < 0.001);
}
#[test]
fn test_activation_memory_with_checkpointing() {
let no_ckpt = estimate_activation_memory(4, 512, 768, 12, 0.0);
let full_ckpt = estimate_activation_memory(4, 512, 768, 12, 1.0);
assert!(
full_ckpt.abs() < 1e-6,
"full_ckpt should be 0, got {full_ckpt}"
);
let half_ckpt = estimate_activation_memory(4, 512, 768, 12, 0.5);
assert!((half_ckpt - no_ckpt * 0.5).abs() < 0.01);
}
#[test]
fn test_is_feasible() {
let feasible = MemoryBudget {
total_gpu_memory_mb: 40_960.0,
model_weights_mb: 7_000.0,
optimizer_states_mb: 14_000.0,
activations_mb: 4_000.0,
gradients_mb: 7_000.0,
reserved_mb: 512.0,
};
assert!(feasible.is_feasible());
let infeasible = MemoryBudget {
total_gpu_memory_mb: 10_000.0,
model_weights_mb: 7_000.0,
optimizer_states_mb: 14_000.0,
activations_mb: 4_000.0,
gradients_mb: 7_000.0,
reserved_mb: 512.0,
};
assert!(!infeasible.is_feasible());
}
#[test]
fn test_available_for_activations_mb() {
let budget = MemoryBudget {
total_gpu_memory_mb: 40_960.0,
model_weights_mb: 7_000.0,
optimizer_states_mb: 14_000.0,
activations_mb: 4_000.0,
gradients_mb: 7_000.0,
reserved_mb: 512.0,
};
let avail = budget.available_for_activations_mb();
assert!((avail - 12_448.0).abs() < 0.01, "got {avail}");
}
#[test]
fn test_recommend_strategy_feasible() {
let budget = MemoryBudget {
total_gpu_memory_mb: 80_000.0,
model_weights_mb: 1_000.0,
optimizer_states_mb: 2_000.0,
activations_mb: 500.0,
gradients_mb: 1_000.0,
reserved_mb: 512.0,
};
let config = MemoryOptimizer::recommend_strategy(&budget, 80_000.0);
assert_eq!(
config.gradient_checkpointing,
SelectiveCheckpointStrategy::None
);
assert!(!config.offload_activations);
}
#[test]
fn test_recommend_strategy_slightly_over() {
let budget = MemoryBudget {
total_gpu_memory_mb: 10_000.0,
model_weights_mb: 4_000.0,
optimizer_states_mb: 5_000.0,
activations_mb: 2_000.0,
gradients_mb: 512.0,
reserved_mb: 0.0,
};
let config = MemoryOptimizer::recommend_strategy(&budget, 10_000.0);
assert_eq!(
config.gradient_checkpointing,
SelectiveCheckpointStrategy::EveryNthLayer { n: 2 }
);
assert!(!config.offload_activations);
}
#[test]
fn test_recommend_strategy_severely_over() {
let budget = MemoryBudget {
total_gpu_memory_mb: 10_000.0,
model_weights_mb: 8_000.0,
optimizer_states_mb: 8_000.0,
activations_mb: 4_000.0,
gradients_mb: 0.0,
reserved_mb: 0.0,
};
let config = MemoryOptimizer::recommend_strategy(&budget, 10_000.0);
assert!(config.offload_activations);
}
#[test]
fn test_compress_decompress_fp16cast_roundtrip() {
let original: Vec<f32> = vec![1.0, -2.5, 0.0, std::f32::consts::PI, 100.0, -0.001];
let compressed =
MemoryOptimizer::compress_activation(&original, ActivationCompression::Fp16Cast);
assert_eq!(compressed.len(), original.len() * 2);
let decompressed = MemoryOptimizer::decompress_activation(
&compressed,
original.len(),
ActivationCompression::Fp16Cast,
)
.expect("decompression failed");
assert_eq!(decompressed.len(), original.len());
for (orig, decomp) in original.iter().zip(decompressed.iter()) {
let rel_err = if orig.abs() > 1e-6 {
(orig - decomp).abs() / orig.abs()
} else {
(orig - decomp).abs()
};
assert!(
rel_err < 0.02,
"orig={orig} decomp={decomp} rel_err={rel_err}"
);
}
}
#[test]
fn test_compress_none_is_f32_bytes() {
let original: Vec<f32> = vec![1.0_f32, 2.0, 3.0];
let compressed =
MemoryOptimizer::compress_activation(&original, ActivationCompression::None);
assert_eq!(compressed.len(), 12);
let decompressed =
MemoryOptimizer::decompress_activation(&compressed, 3, ActivationCompression::None)
.expect("ok");
assert_eq!(decompressed, original);
}
#[test]
fn test_compress_decompress_quantized_roundtrip() {
let original: Vec<f32> = (0..16).map(|i| i as f32 * 0.5).collect();
let method = ActivationCompression::Quantized { bits: 8 };
let compressed = MemoryOptimizer::compress_activation(&original, method);
let decompressed =
MemoryOptimizer::decompress_activation(&compressed, original.len(), method)
.expect("ok");
assert_eq!(decompressed.len(), original.len());
for (o, d) in original.iter().zip(decompressed.iter()) {
assert!((o - d).abs() < 0.1, "orig={o} decomp={d}");
}
}
#[test]
fn test_compress_decompress_sparse_roundtrip() {
let original: Vec<f32> = vec![0.0, 0.0, 3.0, 0.0, -5.0, 0.01, 0.0];
let method = ActivationCompression::Sparse { threshold: 0.05 };
let compressed = MemoryOptimizer::compress_activation(&original, method);
let decompressed =
MemoryOptimizer::decompress_activation(&compressed, original.len(), method)
.expect("ok");
assert_eq!(decompressed.len(), original.len());
assert!((decompressed[2] - 3.0).abs() < 1e-5);
assert!((decompressed[4] - (-5.0)).abs() < 1e-5);
assert_eq!(decompressed[5], 0.0);
}
#[test]
fn test_empty_activation_compression() {
let empty: Vec<f32> = vec![];
for method in [
ActivationCompression::None,
ActivationCompression::Fp16Cast,
ActivationCompression::Quantized { bits: 4 },
ActivationCompression::Sparse { threshold: 0.0 },
] {
let compressed = MemoryOptimizer::compress_activation(&empty, method);
let decompressed =
MemoryOptimizer::decompress_activation(&compressed, 0, method).unwrap_or_default();
assert!(
decompressed.is_empty(),
"method {method:?} produced non-empty output for empty input"
);
}
}
#[test]
fn test_multi_config_validation() {
let config = MemoryOptimizationConfig {
offload_activations: true,
offload_optimizer_states: true,
use_memory_efficient_attention: true,
gradient_checkpointing: SelectiveCheckpointStrategy::Full,
optimizer_state_precision: StatePrecision::Int8Quantized,
activation_compression: ActivationCompression::Fp16Cast,
max_gpu_memory_mb: Some(24_576.0),
};
assert!(config.offload_activations);
assert!(config.offload_optimizer_states);
assert!(config.use_memory_efficient_attention);
assert_eq!(
config.gradient_checkpointing,
SelectiveCheckpointStrategy::Full
);
assert_eq!(
config.optimizer_state_precision,
StatePrecision::Int8Quantized
);
assert_eq!(config.max_gpu_memory_mb, Some(24_576.0));
}
#[test]
fn test_compute_memory_budget_sanity() {
let budget = MemoryOptimizer::compute_memory_budget(
7_000_000_000, 1, 512, 4096, 32, 80_000.0, );
assert!(budget.model_weights_mb > 0.0);
assert!(budget.optimizer_states_mb > 0.0);
assert!(budget.activations_mb >= 0.0);
assert_eq!(budget.gradients_mb, budget.model_weights_mb);
}
}