use alloc::collections::BTreeMap;
use alloc::vec::Vec;
use lazy_static::lazy_static;
use spin::Mutex;
/// Storage tiers, declared fastest to slowest.
///
/// The derived `Ord` follows declaration order (`Dram < OptanePmem <
/// NvmeSsd < Hdd`); the auto-tiering logic relies on this ordering, so
/// the variant order must not change.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum PmemTier {
    /// Volatile DRAM — fastest, lowest latency.
    Dram,
    /// Intel Optane persistent memory (byte-addressable).
    OptanePmem,
    /// NVMe solid-state storage.
    NvmeSsd,
    /// Spinning disk — slowest, highest latency.
    Hdd,
}
impl PmemTier {
pub fn name(&self) -> &'static str {
match self {
PmemTier::Dram => "DRAM",
PmemTier::OptanePmem => "Optane",
PmemTier::NvmeSsd => "NVMe",
PmemTier::Hdd => "HDD",
}
}
pub fn perf_multiplier(&self) -> f32 {
match self {
PmemTier::Dram => 1.0, PmemTier::OptanePmem => 0.25, PmemTier::NvmeSsd => 0.07, PmemTier::Hdd => 0.002, }
}
pub fn latency_ns(&self) -> u64 {
match self {
PmemTier::Dram => 100, PmemTier::OptanePmem => 350, PmemTier::NvmeSsd => 10_000, PmemTier::Hdd => 10_000_000, }
}
}
/// One physical storage device belonging to a tier.
#[derive(Debug, Clone)]
pub struct PmemDevice {
    /// Unique device identifier.
    pub id: u64,
    /// Performance tier this device belongs to.
    pub tier: PmemTier,
    /// Total capacity in bytes.
    pub capacity: u64,
    /// Bytes currently allocated (bump-counter accounting).
    pub used: u64,
    /// Base physical address of the device's mapping.
    pub phys_addr: u64,
    /// Whether direct-access (DAX) mapping is enabled for this device.
    pub dax_enabled: bool,
    /// NUMA node the device is attached to (defaults to 0 in `new`).
    pub numa_node: u32,
}
impl PmemDevice {
    /// Builds a fresh, empty device. DAX is enabled automatically for
    /// Optane persistent memory; the NUMA node defaults to 0.
    pub fn new(id: u64, tier: PmemTier, capacity: u64, phys_addr: u64) -> Self {
        let dax_enabled = matches!(tier, PmemTier::OptanePmem);
        Self {
            id,
            tier,
            capacity,
            used: 0,
            phys_addr,
            dax_enabled,
            numa_node: 0,
        }
    }

    /// Bytes still available on this device.
    pub fn free_space(&self) -> u64 {
        self.capacity.saturating_sub(self.used)
    }

    /// True when `size` bytes would fit in the remaining space.
    pub fn can_allocate(&self, size: u64) -> bool {
        size <= self.free_space()
    }

    /// Bump-allocates `size` bytes and returns the device-local offset.
    ///
    /// NOTE(review): this is simple bump-counter accounting — `free` only
    /// shrinks the usage total, so freed offsets are never reused.
    pub fn allocate(&mut self, size: u64) -> Result<u64, &'static str> {
        if self.can_allocate(size) {
            let offset = self.used;
            self.used += size;
            Ok(offset)
        } else {
            Err("Insufficient space")
        }
    }

    /// Returns `size` bytes to the device's usage accounting.
    pub fn free(&mut self, size: u64) {
        self.used = self.used.saturating_sub(size);
    }
}
/// Access-recency classification of a block, hottest to coldest.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataTemperature {
    /// Accessed within the last minute.
    VeryHot,
    /// Accessed within the last hour.
    Hot,
    /// Accessed within the last day.
    Warm,
    /// Accessed within the last week.
    Cold,
    /// Not accessed for a week or more.
    Frozen,
}
impl DataTemperature {
    /// Classifies a block by how long ago it was last accessed.
    ///
    /// Thresholds (in milliseconds): 1 minute, 1 hour, 1 day, 1 week.
    /// A `last_access_ms` in the future saturates to age 0 (`VeryHot`).
    pub fn from_last_access(now_ms: u64, last_access_ms: u64) -> Self {
        match now_ms.saturating_sub(last_access_ms) {
            0..=59_999 => Self::VeryHot,
            60_000..=3_599_999 => Self::Hot,
            3_600_000..=86_399_999 => Self::Warm,
            86_400_000..=604_799_999 => Self::Cold,
            _ => Self::Frozen,
        }
    }

    /// The storage tier best suited for data of this temperature.
    pub fn optimal_tier(&self) -> PmemTier {
        match self {
            Self::VeryHot => PmemTier::Dram,
            Self::Hot => PmemTier::OptanePmem,
            // Warm and Cold both land on NVMe: fast enough for occasional
            // access without occupying scarce DRAM/Optane space.
            Self::Warm | Self::Cold => PmemTier::NvmeSsd,
            Self::Frozen => PmemTier::Hdd,
        }
    }
}
/// Where a filesystem block currently lives, plus its access metadata.
#[derive(Debug, Clone)]
pub struct BlockPlacement {
    /// Filesystem-level block offset (the key in `PmemManager::placements`).
    pub block_offset: u64,
    /// Tier the block is currently stored on.
    pub tier: PmemTier,
    /// Identifier of the device holding the block.
    pub device_id: u64,
    /// Byte offset within that device.
    pub device_offset: u64,
    /// Timestamp (ms) of the most recent recorded access; 0 if never accessed.
    pub last_access: u64,
    /// Number of recorded accesses.
    pub access_count: u64,
}
/// Cumulative counters for allocations and tier migrations.
#[derive(Debug, Clone, Default)]
pub struct PmemStats {
    /// Successful `allocate` calls (includes internal promote/demote allocations).
    pub total_allocations: u64,
    /// Completed promotions (moves toward faster tiers).
    pub promotions: u64,
    /// Completed demotions (moves toward slower tiers).
    pub demotions: u64,
    /// DAX operation counter (not updated by code visible in this file).
    pub dax_operations: u64,
    /// Total bytes moved by promotions.
    pub promotion_bytes: u64,
    /// Total bytes moved by demotions.
    pub demotion_bytes: u64,
}
// Global singleton manager, guarded by a spin lock (no_std environment).
lazy_static! {
    static ref PMEM_MANAGER: Mutex<PmemManager> = Mutex::new(PmemManager::new());
}
/// Tiered persistent-memory manager: tracks devices per tier, block
/// placements, and migration statistics.
pub struct PmemManager {
    // Devices grouped by tier; allocation scans a tier's devices in order.
    devices: BTreeMap<PmemTier, Vec<PmemDevice>>,
    // Current placement of each block, keyed by block offset.
    placements: BTreeMap<u64, BlockPlacement>,
    // Cumulative allocation/migration counters.
    stats: PmemStats,
    // When false, `auto_tier` is a no-op.
    auto_tiering: bool,
}
impl Default for PmemManager {
    /// Equivalent to [`PmemManager::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl PmemManager {
    /// Creates an empty manager with automatic tiering enabled.
    pub fn new() -> Self {
        Self {
            devices: BTreeMap::new(),
            placements: BTreeMap::new(),
            stats: PmemStats::default(),
            auto_tiering: true,
        }
    }

    /// Registers a storage device under its tier and logs the event.
    pub fn register_device(&mut self, device: PmemDevice) {
        crate::lcpfs_println!(
            "[ PMEM ] Registered {} device #{} ({} GB, DAX: {})",
            device.tier.name(),
            device.id,
            device.capacity / 1024 / 1024 / 1024,
            device.dax_enabled
        );
        self.devices.entry(device.tier).or_default().push(device);
    }

    /// Allocates `size` bytes for `block_offset` on the first device of
    /// `tier` with enough room, recording the resulting placement.
    ///
    /// Errors with "Tier not available" when no device of `tier` is
    /// registered, or "No space on tier" when all its devices are full.
    ///
    /// NOTE(review): allocating an offset that already has a placement
    /// overwrites the map entry without freeing the old device space —
    /// callers that move blocks (promote/demote) must release the old
    /// space themselves.
    pub fn allocate(
        &mut self,
        block_offset: u64,
        size: u64,
        tier: PmemTier,
    ) -> Result<BlockPlacement, &'static str> {
        let devices = self.devices.get_mut(&tier).ok_or("Tier not available")?;
        for device in devices.iter_mut() {
            if device.can_allocate(size) {
                let device_offset = device.allocate(size)?;
                let placement = BlockPlacement {
                    block_offset,
                    tier,
                    device_id: device.id,
                    device_offset,
                    last_access: 0,
                    access_count: 0,
                };
                self.placements.insert(block_offset, placement.clone());
                self.stats.total_allocations += 1;
                return Ok(placement);
            }
        }
        Err("No space on tier")
    }

    /// Allocates on the tier matching `temperature`, falling back to any
    /// other tier (fastest first) when the optimal one is full or absent.
    pub fn allocate_auto(
        &mut self,
        block_offset: u64,
        size: u64,
        temperature: DataTemperature,
    ) -> Result<BlockPlacement, &'static str> {
        let optimal_tier = temperature.optimal_tier();
        if let Ok(placement) = self.allocate(block_offset, size, optimal_tier) {
            return Ok(placement);
        }
        let tiers = [
            PmemTier::Dram,
            PmemTier::OptanePmem,
            PmemTier::NvmeSsd,
            PmemTier::Hdd,
        ];
        // Fix: skip the optimal tier during fallback — it already failed
        // above, so retrying it was redundant work. (A failed `allocate`
        // has no side effects, so skipping the retry is behavior-identical.)
        for &tier in tiers.iter().filter(|&&t| t != optimal_tier) {
            if let Ok(placement) = self.allocate(block_offset, size, tier) {
                return Ok(placement);
            }
        }
        Err("No space on any tier")
    }

    /// Updates access metadata for a placed block; unknown offsets are
    /// silently ignored.
    pub fn record_access(&mut self, block_offset: u64, timestamp: u64) {
        if let Some(placement) = self.placements.get_mut(&block_offset) {
            placement.last_access = timestamp;
            placement.access_count += 1;
        }
    }

    /// Looks up the current (tier, device id) of a placed block.
    fn placement_location(&self, block_offset: u64) -> Result<(PmemTier, u64), &'static str> {
        self.placements
            .get(&block_offset)
            .map(|p| (p.tier, p.device_id))
            .ok_or("Block not found")
    }

    /// Shared move logic for `promote`/`demote`: allocates on `new_tier`
    /// (which records the new placement), then releases the block's old
    /// device space. A failed allocation leaves all state untouched.
    ///
    /// Fix: the original promote/demote inserted the returned placement
    /// into `placements` a second time even though `allocate` had already
    /// inserted the identical value; that redundant insert is gone.
    fn migrate_to(
        &mut self,
        block_offset: u64,
        size: u64,
        old_tier: PmemTier,
        old_device_id: u64,
        new_tier: PmemTier,
    ) -> Result<(), &'static str> {
        self.allocate(block_offset, size, new_tier)?;
        if let Some(devices) = self.devices.get_mut(&old_tier) {
            if let Some(device) = devices.iter_mut().find(|d| d.id == old_device_id) {
                device.free(size);
            }
        }
        Ok(())
    }

    /// Moves a block one tier faster (toward DRAM). A block already on
    /// DRAM is left in place and `Ok(())` is returned.
    pub fn promote(&mut self, block_offset: u64, size: u64) -> Result<(), &'static str> {
        let (old_tier, old_device_id) = self.placement_location(block_offset)?;
        let new_tier = match old_tier {
            PmemTier::Hdd => PmemTier::NvmeSsd,
            PmemTier::NvmeSsd => PmemTier::OptanePmem,
            PmemTier::OptanePmem => PmemTier::Dram,
            PmemTier::Dram => return Ok(()), // already on the fastest tier
        };
        self.migrate_to(block_offset, size, old_tier, old_device_id, new_tier)?;
        self.stats.promotions += 1;
        self.stats.promotion_bytes += size;
        crate::lcpfs_println!(
            "[ PMEM ] Promoted block 0x{:x} from {} to {}",
            block_offset,
            old_tier.name(),
            new_tier.name()
        );
        Ok(())
    }

    /// Moves a block one tier slower (toward HDD). A block already on HDD
    /// is left in place and `Ok(())` is returned.
    pub fn demote(&mut self, block_offset: u64, size: u64) -> Result<(), &'static str> {
        let (old_tier, old_device_id) = self.placement_location(block_offset)?;
        let new_tier = match old_tier {
            PmemTier::Dram => PmemTier::OptanePmem,
            PmemTier::OptanePmem => PmemTier::NvmeSsd,
            PmemTier::NvmeSsd => PmemTier::Hdd,
            PmemTier::Hdd => return Ok(()), // already on the slowest tier
        };
        self.migrate_to(block_offset, size, old_tier, old_device_id, new_tier)?;
        self.stats.demotions += 1;
        self.stats.demotion_bytes += size;
        Ok(())
    }

    /// One auto-tiering pass: classifies every placement by access age and
    /// moves mismatched blocks one tier toward their optimal tier.
    ///
    /// Returns the number of moves *attempted* — a move whose target tier
    /// has no device still counts. This is deliberate: existing callers
    /// (and the in-file tests) rely on attempted moves being counted.
    pub fn auto_tier(&mut self, current_time: u64, block_size: u64) -> u64 {
        if !self.auto_tiering {
            return 0;
        }
        let mut moves = 0;
        // Snapshot the keys so we can mutate `placements` while iterating.
        let blocks: Vec<u64> = self.placements.keys().copied().collect();
        for block_offset in blocks {
            if let Some(placement) = self.placements.get(&block_offset) {
                let temp = DataTemperature::from_last_access(current_time, placement.last_access);
                let optimal = temp.optimal_tier();
                // `PmemTier` orders fastest (Dram) to slowest (Hdd), so a
                // greater optimal tier means the block should move down.
                if optimal > placement.tier {
                    let _ = self.demote(block_offset, block_size);
                    moves += 1;
                } else if optimal < placement.tier {
                    let _ = self.promote(block_offset, block_size);
                    moves += 1;
                }
            }
        }
        moves
    }

    /// Clones the current statistics counters.
    pub fn stats(&self) -> PmemStats {
        self.stats.clone()
    }

    /// Number of registered devices in `tier`.
    pub fn device_count(&self, tier: PmemTier) -> usize {
        self.devices.get(&tier).map(|v| v.len()).unwrap_or(0)
    }

    /// Total capacity in bytes across all devices in `tier`.
    pub fn tier_capacity(&self, tier: PmemTier) -> u64 {
        self.devices
            .get(&tier)
            .map(|devices| devices.iter().map(|d| d.capacity).sum())
            .unwrap_or(0)
    }
}
/// Stateless facade exposing the global `PMEM_MANAGER` singleton.
pub struct Pmem;
impl Pmem {
    /// Adds a device to the global manager.
    pub fn register_device(device: PmemDevice) {
        PMEM_MANAGER.lock().register_device(device);
    }

    /// Temperature-guided allocation through the global manager.
    pub fn allocate(
        block_offset: u64,
        size: u64,
        temperature: DataTemperature,
    ) -> Result<BlockPlacement, &'static str> {
        PMEM_MANAGER
            .lock()
            .allocate_auto(block_offset, size, temperature)
    }

    /// Records an access timestamp for a placed block.
    pub fn record_access(block_offset: u64, timestamp: u64) {
        PMEM_MANAGER.lock().record_access(block_offset, timestamp);
    }

    /// Runs one auto-tiering pass; returns the number of moves attempted.
    pub fn auto_tier(current_time: u64, block_size: u64) -> u64 {
        PMEM_MANAGER.lock().auto_tier(current_time, block_size)
    }

    /// Snapshot of the global statistics counters.
    pub fn stats() -> PmemStats {
        PMEM_MANAGER.lock().stats()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Faster tiers must report higher throughput and lower latency.
    #[test]
    fn test_tier_performance() {
        assert!(PmemTier::Dram.perf_multiplier() > PmemTier::OptanePmem.perf_multiplier());
        assert!(PmemTier::OptanePmem.latency_ns() > PmemTier::Dram.latency_ns());
        assert!(PmemTier::Hdd.latency_ns() > PmemTier::NvmeSsd.latency_ns());
    }

    // Recency thresholds map to the expected temperature and tier.
    #[test]
    fn test_temperature_classification() {
        let now = 1_000_000_000;
        // 30 s old → VeryHot → DRAM.
        let very_hot = DataTemperature::from_last_access(now, now - 30_000);
        assert_eq!(very_hot, DataTemperature::VeryHot);
        assert_eq!(very_hot.optimal_tier(), PmemTier::Dram);
        // > 1 week old → Frozen → HDD.
        let frozen = DataTemperature::from_last_access(now, now - 700_000_000);
        assert_eq!(frozen, DataTemperature::Frozen);
        assert_eq!(frozen.optimal_tier(), PmemTier::Hdd);
    }

    // Bump allocation and free adjust the device's accounting.
    #[test]
    fn test_device_allocation() {
        let mut device = PmemDevice::new(1, PmemTier::OptanePmem, 1_000_000, 0);
        assert_eq!(device.free_space(), 1_000_000);
        assert!(device.can_allocate(500_000));
        let offset = device
            .allocate(500_000)
            .expect("test: operation should succeed");
        assert_eq!(offset, 0);
        assert_eq!(device.free_space(), 500_000);
        device.free(200_000);
        assert_eq!(device.free_space(), 700_000);
    }

    // Registered devices are counted per tier.
    #[test]
    fn test_manager_basic() {
        let mut mgr = PmemManager::new();
        let dram = PmemDevice::new(1, PmemTier::Dram, 16_000_000_000, 0);
        let optane = PmemDevice::new(2, PmemTier::OptanePmem, 128_000_000_000, 0x1000000000);
        mgr.register_device(dram);
        mgr.register_device(optane);
        assert_eq!(mgr.device_count(PmemTier::Dram), 1);
        assert_eq!(mgr.device_count(PmemTier::OptanePmem), 1);
    }

    // Temperature-guided allocation lands on the optimal tier.
    #[test]
    fn test_auto_allocation() {
        let mut mgr = PmemManager::new();
        mgr.register_device(PmemDevice::new(1, PmemTier::Dram, 1_000_000, 0));
        mgr.register_device(PmemDevice::new(2, PmemTier::OptanePmem, 10_000_000, 0));
        let placement = mgr
            .allocate_auto(0x1000, 4096, DataTemperature::VeryHot)
            .expect("test: operation should succeed");
        assert_eq!(placement.tier, PmemTier::Dram);
        let placement2 = mgr
            .allocate_auto(0x2000, 4096, DataTemperature::Hot)
            .expect("test: operation should succeed");
        assert_eq!(placement2.tier, PmemTier::OptanePmem);
    }

    // promote/demote each move a block exactly one tier and bump stats.
    #[test]
    fn test_promotion_demotion() {
        let mut mgr = PmemManager::new();
        mgr.register_device(PmemDevice::new(1, PmemTier::Dram, 10_000_000, 0));
        mgr.register_device(PmemDevice::new(2, PmemTier::OptanePmem, 100_000_000, 0));
        mgr.register_device(PmemDevice::new(3, PmemTier::NvmeSsd, 1_000_000_000, 0));
        mgr.allocate(0x1000, 4096, PmemTier::NvmeSsd)
            .expect("test: operation should succeed");
        mgr.promote(0x1000, 4096)
            .expect("test: operation should succeed");
        let placement = mgr
            .placements
            .get(&0x1000)
            .expect("test: operation should succeed");
        assert_eq!(placement.tier, PmemTier::OptanePmem);
        mgr.demote(0x1000, 4096)
            .expect("test: operation should succeed");
        let placement = mgr
            .placements
            .get(&0x1000)
            .expect("test: operation should succeed");
        assert_eq!(placement.tier, PmemTier::NvmeSsd);
        assert!(mgr.stats.promotions > 0);
        assert!(mgr.stats.demotions > 0);
    }

    // A stale DRAM block triggers a move attempt during an auto-tier pass.
    // (The attempted demotion targets Optane — not registered here — so the
    // move counter includes failed attempts by design.)
    #[test]
    fn test_auto_tiering() {
        let mut mgr = PmemManager::new();
        mgr.register_device(PmemDevice::new(1, PmemTier::Dram, 100_000_000, 0));
        mgr.register_device(PmemDevice::new(2, PmemTier::Hdd, 1_000_000_000, 0));
        mgr.allocate(0x1000, 4096, PmemTier::Dram)
            .expect("test: operation should succeed");
        mgr.record_access(0x1000, 0);
        // 30 days later — the block is Frozen.
        let current_time = 30 * 24 * 3600 * 1000;
        let moves = mgr.auto_tier(current_time, 4096);
        assert!(moves > 0);
    }

    // Only Optane devices get DAX enabled by the constructor.
    #[test]
    fn test_dax_support() {
        let dram = PmemDevice::new(1, PmemTier::Dram, 1_000_000, 0);
        let optane = PmemDevice::new(2, PmemTier::OptanePmem, 1_000_000, 0x1000000);
        assert!(!dram.dax_enabled);
        assert!(optane.dax_enabled);
    }

    // Tier capacity sums across all devices in the tier.
    #[test]
    fn test_tier_capacity() {
        let mut mgr = PmemManager::new();
        mgr.register_device(PmemDevice::new(1, PmemTier::Dram, 16_000_000_000, 0));
        mgr.register_device(PmemDevice::new(2, PmemTier::Dram, 16_000_000_000, 0));
        assert_eq!(mgr.tier_capacity(PmemTier::Dram), 32_000_000_000);
    }
}