use std::fmt;
/// Errors reported by the gradient-checkpoint planner and the activation buffer.
///
/// Every payload is `Eq`, so the enum derives `Eq` alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GradCheckpointError {
    /// The layer list is empty.
    EmptyLayers,
    /// A checkpoint policy was rejected; the payload is a free-form explanation.
    InvalidPolicy(String),
    /// A layer index could not be used; the payload is the offending index.
    ///
    /// `ActivationBuffer` also returns this variant when a layer index is pushed twice and when
    /// an untracked layer is evicted.
    LayerIndexOutOfBounds(usize),
    /// The memory budget cannot hold the largest single layer.
    BudgetTooSmall {
        /// Budget that was supplied, in bytes.
        budget: usize,
        /// Smallest budget that would have been accepted, in bytes.
        minimum: usize,
    },
}
impl fmt::Display for GradCheckpointError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
GradCheckpointError::EmptyLayers => write!(f, "layer list is empty"),
GradCheckpointError::InvalidPolicy(msg) => write!(f, "invalid policy: {msg}"),
GradCheckpointError::LayerIndexOutOfBounds(idx) => {
write!(f, "layer index {idx} is out of bounds")
},
GradCheckpointError::BudgetTooSmall { budget, minimum } => write!(
f,
"memory budget {budget} bytes is smaller than the minimum required {minimum} bytes"
),
}
}
}
// Empty impl: the trait's default methods are used, so `source()` reports no underlying cause.
impl std::error::Error for GradCheckpointError {}
/// Rule that decides which layers `GradCheckpointPlanner` checkpoints.
///
/// Every payload is `Eq`, so the enum derives `Eq` alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CheckpointPolicy {
    /// Checkpoint every layer whose index is a multiple of the value (layer 0 included).
    /// A value of 0 checkpoints nothing.
    EveryN(usize),
    /// Checkpoint exactly the layers whose indices appear in the list.
    SpecificLayers(Vec<usize>),
    /// Checkpoint every layer whose recorded activation size is at least this many bytes.
    MemoryThreshold(usize),
    /// Checkpoint the layers selected by
    /// `GradCheckpointPlanner::compute_optimal_checkpoints` for the given budget.
    ///
    /// If that computation fails (budget smaller than the largest layer), the planner
    /// checkpoints nothing.
    Optimal {
        /// Budget in bytes handed to the checkpoint computation.
        target_memory_bytes: usize,
    },
}
/// Decides, per layer, whether its activation should be checkpointed under a given policy.
///
/// Derives `Debug` and `Clone` so planners can be logged and duplicated like the other public
/// types in this module.
#[derive(Debug, Clone)]
pub struct GradCheckpointPlanner {
    /// Number of layers the planner covers; indices at or beyond this are never checkpointed.
    pub num_layers: usize,
    /// Rule used to pick the checkpointed layers.
    pub policy: CheckpointPolicy,
    /// Activation size of each layer, indexed by layer; a missing entry is treated as 0.
    pub layer_activation_sizes: Vec<usize>,
}
impl GradCheckpointPlanner {
    /// Creates a planner for `num_layers` layers whose activation sizes are all zero.
    pub fn new(num_layers: usize, policy: CheckpointPolicy) -> Self {
        Self {
            num_layers,
            policy,
            layer_activation_sizes: vec![0; num_layers],
        }
    }

    /// Creates a planner with caller-supplied per-layer activation sizes.
    ///
    /// The length of `layer_activation_sizes` is not validated against `num_layers`: a layer
    /// without a recorded size counts as size 0, and entries at index `num_layers` or beyond
    /// are never consulted.
    pub fn with_sizes(
        num_layers: usize,
        policy: CheckpointPolicy,
        layer_activation_sizes: Vec<usize>,
    ) -> Self {
        Self {
            num_layers,
            policy,
            layer_activation_sizes,
        }
    }

    /// Recorded activation size of `layer_idx`, or 0 when no size was supplied for it.
    fn activation_size(&self, layer_idx: usize) -> usize {
        self.layer_activation_sizes
            .get(layer_idx)
            .copied()
            .unwrap_or(0)
    }

    /// Activation sizes restricted to the layers this planner covers.
    fn planned_sizes(&self) -> &[usize] {
        let covered = self.num_layers.min(self.layer_activation_sizes.len());
        &self.layer_activation_sizes[..covered]
    }

    /// Indices of every checkpointed layer, in ascending order.
    ///
    /// The `Optimal` policy is solved once for all layers here; asking
    /// `should_checkpoint_layer` layer by layer would repeat the whole solve for each index.
    fn checkpointed_layers(&self) -> Vec<usize> {
        match &self.policy {
            CheckpointPolicy::Optimal {
                target_memory_bytes,
            } => {
                // A failed solve means "checkpoint nothing", as in `should_checkpoint_layer`.
                Self::compute_optimal_checkpoints(self.planned_sizes(), *target_memory_bytes)
                    .unwrap_or_default()
            }
            CheckpointPolicy::EveryN(_)
            | CheckpointPolicy::SpecificLayers(_)
            | CheckpointPolicy::MemoryThreshold(_) => (0..self.num_layers)
                .filter(|&idx| self.should_checkpoint_layer(idx))
                .collect(),
        }
    }

    /// Returns whether `layer_idx` is checkpointed under the configured policy.
    ///
    /// Indices at or beyond `num_layers` are never checkpointed. Under
    /// [`CheckpointPolicy::Optimal`], a budget that cannot hold the largest layer results in
    /// `false` for every layer.
    pub fn should_checkpoint_layer(&self, layer_idx: usize) -> bool {
        if layer_idx >= self.num_layers {
            return false;
        }
        match &self.policy {
            // A stride of zero selects nothing.
            CheckpointPolicy::EveryN(0) => false,
            CheckpointPolicy::EveryN(n) => layer_idx.is_multiple_of(*n),
            CheckpointPolicy::SpecificLayers(indices) => indices.contains(&layer_idx),
            CheckpointPolicy::MemoryThreshold(threshold) => {
                self.activation_size(layer_idx) >= *threshold
            }
            CheckpointPolicy::Optimal {
                target_memory_bytes,
            } => Self::compute_optimal_checkpoints(self.planned_sizes(), *target_memory_bytes)
                .is_ok_and(|plan| plan.contains(&layer_idx)),
        }
    }

    /// Sum of the recorded activation sizes of all checkpointed layers.
    ///
    /// The sum saturates at `usize::MAX` instead of overflowing, matching the saturating
    /// arithmetic used by `ActivationBuffer`.
    pub fn total_memory_saved(&self) -> usize {
        self.checkpointed_layers()
            .into_iter()
            .map(|idx| self.activation_size(idx))
            .fold(0, usize::saturating_add)
    }

    /// Number of checkpointed layers.
    pub fn recompute_cost(&self) -> usize {
        self.checkpointed_layers().len()
    }

    /// Greedily selects the layers to checkpoint so that every run of kept layers fits in
    /// `budget_bytes`.
    ///
    /// Layers are visited in order while a running total of kept sizes is maintained. A layer
    /// that would push the total past the budget is checkpointed instead, and the total
    /// restarts from zero after it. The returned indices are in ascending order. An empty
    /// `layer_sizes` yields an empty plan.
    ///
    /// # Errors
    ///
    /// Returns [`GradCheckpointError::BudgetTooSmall`] when the largest single layer exceeds
    /// `budget_bytes`.
    pub fn compute_optimal_checkpoints(
        layer_sizes: &[usize],
        budget_bytes: usize,
    ) -> Result<Vec<usize>, GradCheckpointError> {
        let Some(max_layer) = layer_sizes.iter().copied().max() else {
            return Ok(Vec::new());
        };
        if max_layer > budget_bytes {
            return Err(GradCheckpointError::BudgetTooSmall {
                budget: budget_bytes,
                minimum: max_layer,
            });
        }

        let mut checkpointed = Vec::new();
        let mut segment_sum: usize = 0;
        for (idx, &size) in layer_sizes.iter().enumerate() {
            match segment_sum.checked_add(size) {
                Some(total) if total <= budget_bytes => segment_sum = total,
                // Over budget. A sum that does not even fit in `usize` necessarily exceeds
                // the budget, so arithmetic overflow lands here as well.
                _ => {
                    checkpointed.push(idx);
                    segment_sum = 0;
                }
            }
        }
        Ok(checkpointed)
    }
}
/// One layer's activation record as stored in `ActivationBuffer::layers`.
#[derive(Debug, Clone)]
pub struct LayerActivation {
/// Index identifying the layer; `ActivationBuffer::push_layer` rejects a repeated index.
pub layer_idx: usize,
/// Activation size as passed to `ActivationBuffer::push_layer`.
pub size_bytes: usize,
/// When `true`, `ActivationBuffer` leaves `size_bytes` out of its `current_size` total.
pub is_checkpointed: bool,
}
/// Tracks the activations currently recorded and how many bytes the non-checkpointed ones use.
///
/// Derives `Debug` and `Clone` so buffers can be logged and snapshotted like the
/// `LayerActivation` entries they hold.
#[derive(Debug, Clone)]
pub struct ActivationBuffer {
    /// Nominal capacity in bytes. Used as the denominator of `memory_pressure`; pushing layers
    /// does not check against it.
    pub max_size: usize,
    /// Saturating total of `size_bytes` over the tracked layers that are not checkpointed.
    pub current_size: usize,
    /// Tracked layers, in the order they were pushed.
    pub layers: Vec<LayerActivation>,
}
impl ActivationBuffer {
    /// Builds an empty buffer with the given nominal capacity.
    pub fn new(max_size: usize) -> Self {
        Self {
            layers: Vec::new(),
            current_size: 0,
            max_size,
        }
    }

    /// Records a layer's activation.
    ///
    /// A checkpointed layer is recorded but adds nothing to `current_size`; any other layer
    /// adds `size_bytes` (saturating). `max_size` is not consulted.
    ///
    /// # Errors
    ///
    /// Returns [`GradCheckpointError::LayerIndexOutOfBounds`] carrying `layer_idx` when that
    /// index is already tracked; the buffer is left unchanged in that case.
    pub fn push_layer(
        &mut self,
        layer_idx: usize,
        size_bytes: usize,
        checkpointed: bool,
    ) -> Result<(), GradCheckpointError> {
        let already_tracked = self
            .layers
            .iter()
            .map(|entry| entry.layer_idx)
            .any(|tracked| tracked == layer_idx);
        if already_tracked {
            return Err(GradCheckpointError::LayerIndexOutOfBounds(layer_idx));
        }

        // Checkpointed activations contribute zero bytes to the running total.
        let counted_bytes = if checkpointed { 0 } else { size_bytes };
        self.current_size = self.current_size.saturating_add(counted_bytes);

        let entry = LayerActivation {
            layer_idx,
            size_bytes,
            is_checkpointed: checkpointed,
        };
        self.layers.push(entry);
        Ok(())
    }

    /// Stops tracking a layer and gives back the bytes it contributed to `current_size`.
    ///
    /// The remaining layers keep their relative order.
    ///
    /// # Errors
    ///
    /// Returns [`GradCheckpointError::LayerIndexOutOfBounds`] carrying `layer_idx` when no
    /// tracked layer has that index.
    pub fn evict_layer(&mut self, layer_idx: usize) -> Result<(), GradCheckpointError> {
        let Some(slot) = self
            .layers
            .iter()
            .position(|entry| entry.layer_idx == layer_idx)
        else {
            return Err(GradCheckpointError::LayerIndexOutOfBounds(layer_idx));
        };

        let removed = self.layers.remove(slot);
        // Checkpointed layers never added to the total, so they release nothing.
        let released_bytes = if removed.is_checkpointed {
            0
        } else {
            removed.size_bytes
        };
        self.current_size = self.current_size.saturating_sub(released_bytes);
        Ok(())
    }

    /// Ratio of `current_size` to `max_size`, clamped to `0.0..=1.0`.
    ///
    /// A buffer whose `max_size` is 0 always reports `0.0`.
    pub fn memory_pressure(&self) -> f32 {
        match self.max_size {
            0 => 0.0,
            capacity => {
                let ratio = self.current_size as f32 / capacity as f32;
                ratio.clamp(0.0, 1.0)
            }
        }
    }
}
/// Unit tests for the planner policies, the greedy checkpoint selection, the activation buffer
/// bookkeeping, and the error messages.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a planner whose layer count equals the number of supplied sizes.
    fn planner_with_sizes(sizes: Vec<usize>, policy: CheckpointPolicy) -> GradCheckpointPlanner {
        let n = sizes.len();
        GradCheckpointPlanner::with_sizes(n, policy, sizes)
    }

    #[test]
    fn test_planner_every_n2_on_8_layers() {
        let planner = GradCheckpointPlanner::new(8, CheckpointPolicy::EveryN(2));
        for i in 0..8_usize {
            let expected = i % 2 == 0;
            assert_eq!(
                planner.should_checkpoint_layer(i),
                expected,
                "layer {i}: expected checkpoint={expected}"
            );
        }
    }

    #[test]
    fn test_planner_every_n3() {
        let planner = GradCheckpointPlanner::new(9, CheckpointPolicy::EveryN(3));
        assert!(planner.should_checkpoint_layer(0));
        assert!(planner.should_checkpoint_layer(3));
        assert!(planner.should_checkpoint_layer(6));
        assert!(!planner.should_checkpoint_layer(1));
        assert!(!planner.should_checkpoint_layer(2));
        assert!(!planner.should_checkpoint_layer(4));
        assert!(!planner.should_checkpoint_layer(5));
    }

    #[test]
    fn test_planner_every_n0_no_checkpoint() {
        let planner = GradCheckpointPlanner::new(6, CheckpointPolicy::EveryN(0));
        for i in 0..6 {
            assert!(
                !planner.should_checkpoint_layer(i),
                "n=0 should never checkpoint"
            );
        }
    }

    #[test]
    fn test_planner_specific_layers() {
        let planner =
            GradCheckpointPlanner::new(8, CheckpointPolicy::SpecificLayers(vec![1, 3, 5]));
        assert!(!planner.should_checkpoint_layer(0));
        assert!(planner.should_checkpoint_layer(1));
        assert!(!planner.should_checkpoint_layer(2));
        assert!(planner.should_checkpoint_layer(3));
        assert!(!planner.should_checkpoint_layer(4));
        assert!(planner.should_checkpoint_layer(5));
        assert!(!planner.should_checkpoint_layer(6));
        assert!(!planner.should_checkpoint_layer(7));
    }

    #[test]
    fn test_planner_memory_threshold() {
        let sizes = vec![500, 1500, 800, 2000, 100];
        let planner = planner_with_sizes(sizes, CheckpointPolicy::MemoryThreshold(1000));
        assert!(!planner.should_checkpoint_layer(0));
        assert!(planner.should_checkpoint_layer(1));
        assert!(!planner.should_checkpoint_layer(2));
        assert!(planner.should_checkpoint_layer(3));
        assert!(!planner.should_checkpoint_layer(4));
    }

    #[test]
    fn test_planner_total_memory_saved() {
        let sizes = vec![100_usize, 200, 300, 400];
        let planner = planner_with_sizes(sizes, CheckpointPolicy::EveryN(2));
        assert_eq!(planner.total_memory_saved(), 400);
    }

    #[test]
    fn test_planner_recompute_cost() {
        let sizes = vec![100_usize; 6];
        let planner = planner_with_sizes(sizes, CheckpointPolicy::EveryN(2));
        assert_eq!(planner.recompute_cost(), 3);
    }

    #[test]
    fn test_compute_optimal_empty_returns_empty() {
        let result = GradCheckpointPlanner::compute_optimal_checkpoints(&[], 1024)
            .expect("empty should succeed");
        assert!(result.is_empty());
    }

    #[test]
    fn test_compute_optimal_large_budget_no_checkpoints() {
        let sizes = vec![100_usize, 200, 300, 400];
        let result = GradCheckpointPlanner::compute_optimal_checkpoints(&sizes, 10_000)
            .expect("should succeed");
        assert!(
            result.is_empty(),
            "large budget should require no checkpoints"
        );
    }

    #[test]
    fn test_compute_optimal_tight_budget_checkpoints_some() {
        let sizes = vec![200_usize, 300, 200, 300];
        let result = GradCheckpointPlanner::compute_optimal_checkpoints(&sizes, 500)
            .expect("should succeed");
        let result2 = GradCheckpointPlanner::compute_optimal_checkpoints(&sizes, 300)
            .expect("should succeed");
        assert!(
            !result2.is_empty(),
            "tight budget should produce checkpoints"
        );
        // The looser-budget plan used to be computed and then discarded; check it as well.
        assert!(
            result.iter().chain(result2.iter()).all(|&idx| idx < sizes.len()),
            "checkpointed indices must refer to existing layers"
        );
        assert!(
            result.len() <= result2.len(),
            "a looser budget should not require more checkpoints than a tighter one"
        );
    }

    #[test]
    fn test_compute_optimal_budget_too_small_error() {
        let sizes = vec![100_usize, 500, 200];
        let result = GradCheckpointPlanner::compute_optimal_checkpoints(&sizes, 400);
        assert!(
            matches!(result, Err(GradCheckpointError::BudgetTooSmall { .. })),
            "should return BudgetTooSmall"
        );
    }

    #[test]
    fn test_activation_buffer_push_increases_current_size() {
        let mut buf = ActivationBuffer::new(10_000);
        buf.push_layer(0, 1024, false).expect("push ok");
        assert_eq!(buf.current_size, 1024);
        buf.push_layer(1, 512, false).expect("push ok");
        assert_eq!(buf.current_size, 1536);
    }

    #[test]
    fn test_activation_buffer_checkpointed_layer_no_size() {
        let mut buf = ActivationBuffer::new(10_000);
        buf.push_layer(0, 2048, true).expect("push ok");
        assert_eq!(
            buf.current_size, 0,
            "checkpointed layer should not add to current_size"
        );
    }

    #[test]
    fn test_activation_buffer_evict_decreases_size() {
        let mut buf = ActivationBuffer::new(10_000);
        buf.push_layer(0, 1000, false).expect("push ok");
        buf.push_layer(1, 2000, false).expect("push ok");
        buf.evict_layer(0).expect("evict ok");
        assert_eq!(buf.current_size, 2000);
    }

    #[test]
    fn test_activation_buffer_evict_nonexistent_returns_err() {
        let mut buf = ActivationBuffer::new(10_000);
        let result = buf.evict_layer(42);
        assert!(
            matches!(result, Err(GradCheckpointError::LayerIndexOutOfBounds(42))),
            "evicting non-existent layer should return error"
        );
    }

    #[test]
    fn test_activation_buffer_memory_pressure() {
        let mut buf = ActivationBuffer::new(4000);
        buf.push_layer(0, 1000, false).expect("push ok");
        buf.push_layer(1, 1000, false).expect("push ok");
        let pressure = buf.memory_pressure();
        assert!(
            (pressure - 0.5).abs() < 1e-5,
            "memory pressure should be 0.5: got {pressure}"
        );
    }

    #[test]
    fn test_activation_buffer_duplicate_layer_returns_err() {
        let mut buf = ActivationBuffer::new(10_000);
        buf.push_layer(5, 100, false).expect("first push ok");
        let result = buf.push_layer(5, 200, false);
        assert!(
            matches!(result, Err(GradCheckpointError::LayerIndexOutOfBounds(5))),
            "duplicate layer_idx should return error"
        );
    }

    #[test]
    fn test_grad_checkpoint_error_display() {
        let e1 = GradCheckpointError::EmptyLayers;
        assert!(e1.to_string().contains("empty"));
        let e2 = GradCheckpointError::InvalidPolicy("bad n".into());
        assert!(e2.to_string().contains("bad n"));
        let e3 = GradCheckpointError::LayerIndexOutOfBounds(7);
        assert!(e3.to_string().contains('7'));
        let e4 = GradCheckpointError::BudgetTooSmall {
            budget: 100,
            minimum: 500,
        };
        assert!(e4.to_string().contains("100"));
        assert!(e4.to_string().contains("500"));
    }
}