oxillama_runtime/offload/
mod.rs

1//! CPU/disk offload with a pinned hot-layer set.
2//!
3//! This module provides the infrastructure for offloading model weights to disk
4//! and loading them on-demand with an LRU eviction policy. A pinned hot-set
5//! keeps embeddings, the output head, and the last N attention layers always
6//! resident in RAM.
7//!
8//! # Overview
9//!
10//! - [`OffloadPolicy`] — declarative configuration: none, budget, or pinned hot-set.
11//! - [`LayerPager`] — LRU weight pager with eviction, pinned tensors, and on-demand
12//!   loads from a [`PagerSource`].
13//! - [`MemoryPressureProbe`] — lightweight OS-level pressure monitor (Linux / macOS).
14
15pub mod pager;
16pub mod policy;
17pub mod pressure;
18
19pub use pager::{FilePagerSource, LayerPager, PagerSource, ResidentTensor, TensorEntry, TensorId};
20pub use policy::OffloadPolicy;
21pub use pressure::MemoryPressureProbe;
oxillama_runtime/offload/mod.rs

oxillama_runtime/offload/
mod.rs