lcpfs 2026.1.102

LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! # Adaptive Replacement Cache (ARC)
//!
//! This module implements the Adaptive Replacement Cache algorithm, a
//! self-tuning cache that balances recency and frequency of access.
//!
//! ## Overview
//!
//! ARC was developed at IBM Research and is used by ZFS for its block cache.
//! Unlike simple LRU caches, ARC maintains "ghost lists" that track recently
//! evicted items, allowing it to adapt to changing workload patterns.
//!
//! ## How It Works
//!
//! ARC maintains four lists:
//!
//! - **T1**: Recently accessed items (seen once) - recency
//! - **T2**: Frequently accessed items (seen multiple times) - frequency
//! - **B1**: Ghost list for T1 evictions (metadata only)
//! - **B2**: Ghost list for T2 evictions (metadata only)
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────┐
//! │                      ARC Cache                              │
//! ├──────────────────────┬──────────────────────────────────────┤
//! │        T1            │               T2                     │
//! │   (MRU - Recency)    │         (MFU - Frequency)            │
//! ├──────────────────────┼──────────────────────────────────────┤
//! │        B1            │               B2                     │
//! │   (Ghost - Recency)  │         (Ghost - Frequency)          │
//! └──────────────────────┴──────────────────────────────────────┘
//! ```
//!
//! ## Adaptation
//!
//! The parameter `p` controls the target size of T1 vs T2:
//!
//! - Cache miss in B1 → workload favors recency → increase `p`
//! - Cache miss in B2 → workload favors frequency → decrease `p`
//!
//! ## Usage
//!
//! ```rust,ignore
//! use lcpfs::lcpfs_arc::ARC;
//!
//! let mut arc = ARC.lock();
//!
//! // Try to read from cache
//! if let Some(data) = arc.read(&dva) {
//!     // Cache hit
//! } else {
//!     // Cache miss - read from disk
//!     let data = read_from_disk(&dva);
//!     arc.cache(dva, data);
//! }
//! ```
//!
//! ## Performance
//!
//! ARC provides:
//! - O(1) lookup via hash map
//! - Self-tuning between scan-resistant and frequency-based caching
//! - Efficient memory usage through ghost lists (metadata only)

use crate::fscore::structs::Dva;
use alloc::collections::{BTreeMap, VecDeque};
use alloc::vec::Vec;
use core::cmp;
use lazy_static::lazy_static;
use spin::Mutex;

// ═══════════════════════════════════════════════════════════════════════════════
// CONFIGURATION
// ═══════════════════════════════════════════════════════════════════════════════

/// Maximum cache size in bytes (100 GiB).
///
/// This is a soft limit - the cache will evict entries to stay under this size.
///
/// NOTE(review): `100 * 1024 * 1024 * 1024` does not fit in a 32-bit
/// `usize`, so this constant fails to compile on 32-bit targets — confirm
/// the supported targets are 64-bit only.
const ARC_MAX_BYTES: usize = 100 * 1024 * 1024 * 1024;

/// Number of shards for parallel cache access.
///
/// DVAs are hashed to a shard (see `get_shard_index`) so independent
/// lookups can proceed under different locks.
const NUM_SHARDS: usize = 16;

/// Maximum bytes per shard (total budget split evenly across shards).
const SHARD_MAX_BYTES: usize = ARC_MAX_BYTES / NUM_SHARDS;

// ═══════════════════════════════════════════════════════════════════════════════
// ARC HEADER
// ═══════════════════════════════════════════════════════════════════════════════

/// Metadata header for a cached block.
///
/// Each cached block has an associated header that tracks access patterns
/// and cache state.
/// Metadata header for a cached block.
///
/// Each cached block has an associated header that tracks access patterns
/// and cache state. The header owns the block payload itself (`data`);
/// the T1/T2/B1/B2 lists in [`Arc`] store only the DVA keys.
pub struct ArcHeader {
    /// The cached block data.
    pub data: Vec<u8>,
    /// Number of times this block has been accessed.
    pub access_count: u64,
    /// Size of the cached data in bytes (equals `data.len()` at insert time).
    pub size: usize,
    /// Whether this block is deduplicated.
    /// NOTE(review): always set to `false` in this module — presumably
    /// toggled by dedup code elsewhere; confirm before relying on it.
    pub is_dedup: bool,
    /// Which list this entry belongs to (1 = T1/recency, 2 = T2/frequency).
    /// No other values are ever assigned in this module.
    pub list_id: u8,
}

// ═══════════════════════════════════════════════════════════════════════════════
// ARC CACHE
// ═══════════════════════════════════════════════════════════════════════════════

/// Adaptive Replacement Cache implementation.
///
/// This cache automatically adapts to workload patterns by tracking both
/// recently used (recency) and frequently used (frequency) items.
/// Adaptive Replacement Cache implementation.
///
/// This cache automatically adapts to workload patterns by tracking both
/// recently used (recency) and frequently used (frequency) items.
///
/// NOTE(review): the type name shadows the ubiquitous `alloc::sync::Arc`;
/// consider renaming (e.g. `ArcCache`) in a future API revision to avoid
/// confusing imports at call sites.
pub struct Arc {
    /// Index mapping DVAs to cached headers.
    /// Invariant: every key here also appears in exactly one of `t1`/`t2`.
    pub index: BTreeMap<Dva, ArcHeader>,

    /// T1: Most Recently Used list (single-access items).
    /// Front = LRU (evict from here), Back = MRU (add here)
    pub t1: VecDeque<Dva>,

    /// T2: Most Frequently Used list (multi-access items).
    /// Front = LRU (evict from here), Back = MRU (add here)
    pub t2: VecDeque<Dva>,

    /// B1: Ghost list for T1 (tracks recently evicted from T1).
    /// Metadata only — no block data is retained for ghosts.
    pub b1: VecDeque<Dva>,

    /// B2: Ghost list for T2 (tracks recently evicted from T2).
    /// Metadata only — no block data is retained for ghosts.
    pub b2: VecDeque<Dva>,

    /// Current cache size in bytes (sum of `ArcHeader::size` over `index`).
    pub current_size: usize,

    /// Total cache hits since initialization.
    pub hits: u64,

    /// Total cache misses since initialization.
    pub misses: u64,

    /// Target size for T1 (adaptive parameter), counted in entries.
    ///
    /// This value is automatically adjusted based on workload:
    /// - Increases when B1 hits occur (workload favors recency)
    /// - Decreases when B2 hits occur (workload favors frequency)
    pub p: usize,

    /// Maximum size for this shard in bytes (soft limit enforced by eviction).
    pub max_bytes: usize,
}

lazy_static! {
    /// Global ARC instance.
    ///
    /// All block cache operations go through this shared cache.
    pub static ref ARC: Mutex<Arc> = Mutex::new(Arc::new());

    /// Sharded ARC instances for parallel access.
    ///
    /// Each shard is capped at `SHARD_MAX_BYTES` so the shards together
    /// respect `ARC_MAX_BYTES`. Built with `core::array::from_fn` so the
    /// array length always tracks `NUM_SHARDS`, instead of a hand-copied
    /// list of 16 identical initializers that would silently desync if
    /// the constant changed.
    pub static ref SHARDED_ARC: [Mutex<Arc>; NUM_SHARDS] =
        core::array::from_fn(|_| Mutex::new(Arc::new_with_max(SHARD_MAX_BYTES)));
}

// Sharded API functions

/// Calculate shard index for a DVA.
///
/// Mixes the vdev id with the offset (scaled by the 32-bit golden-ratio
/// constant 0x9e3779b9) and reduces the result modulo `NUM_SHARDS`.
#[inline]
fn get_shard_index(dva: &Dva) -> usize {
    let vdev_bits = dva.vdev as usize;
    let offset_bits = (dva.offset as usize).wrapping_mul(0x9e3779b9);
    (vdev_bits ^ offset_bits) % NUM_SHARDS
}

/// Read from the sharded cache.
///
/// Routes the lookup to the shard owning `dva`; returns the cached data
/// on a hit, `None` on a miss.
pub fn arc_read(dva: &Dva) -> Option<Vec<u8>> {
    let mut shard = SHARDED_ARC[get_shard_index(dva)].lock();
    shard.read(dva)
}

/// Write to the sharded cache.
///
/// Routes the insertion to the shard owning `dva`.
pub fn arc_cache(dva: Dva, data: Vec<u8>) {
    let idx = get_shard_index(&dva);
    let mut shard = SHARDED_ARC[idx].lock();
    shard.cache(dva, data);
}

/// Get aggregate statistics across all shards.
///
/// Returns `(hits, misses, hit_rate, current_size_bytes)` where the
/// counters are summed over every shard and the hit rate is recomputed
/// from the aggregate totals.
pub fn arc_stats() -> (u64, u64, f64, usize) {
    let (hits, misses, size) = SHARDED_ARC
        .iter()
        .fold((0u64, 0u64, 0usize), |(h, m, s), shard| {
            let (sh, sm, _, ss) = shard.lock().stats();
            (h + sh, m + sm, s + ss)
        });

    let lookups = hits + misses;
    let hit_rate = if lookups == 0 {
        0.0
    } else {
        hits as f64 / lookups as f64
    };
    (hits, misses, hit_rate, size)
}

impl Arc {
    /// Create a new, empty ARC cache with the default max size
    /// (`ARC_MAX_BYTES`).
    pub fn new() -> Self {
        Self::new_with_max(ARC_MAX_BYTES)
    }

    /// Create a new, empty ARC cache with the specified max size in bytes.
    pub fn new_with_max(max_bytes: usize) -> Self {
        Self {
            index: BTreeMap::new(),
            t1: VecDeque::new(),
            t2: VecDeque::new(),
            b1: VecDeque::new(),
            b2: VecDeque::new(),
            current_size: 0,
            hits: 0,
            misses: 0,
            p: 0,
            max_bytes,
        }
    }

    /// Read a block from the cache.
    ///
    /// # Arguments
    ///
    /// * `dva` - Device Virtual Address of the block to read
    ///
    /// # Returns
    ///
    /// - `Some(data)` - The cached block data (cache hit)
    /// - `None` - Block not in cache (cache miss)
    ///
    /// # Side Effects
    ///
    /// - On hit: Increments access count, promotes T1 entries to T2,
    ///   refreshes T2 entries to the MRU end
    /// - On miss: Checks ghost lists and adapts the `p` parameter
    pub fn read(&mut self, dva: &Dva) -> Option<Vec<u8>> {
        if let Some(header) = self.index.get_mut(dva) {
            header.access_count += 1;
            self.hits += 1;

            // `header` borrows only `self.index`, so the list fields can
            // still be mutated here (disjoint field borrows).
            if header.list_id == 1 {
                // Second access: promote from T1 (recency) to T2 (frequency).
                if let Some(pos) = self.t1.iter().position(|x| x == dva) {
                    self.t1.remove(pos);
                    self.t2.push_back(*dva); // Add to MRU end
                    header.list_id = 2;
                }
            } else if header.list_id == 2 {
                // Already frequent: refresh to the MRU end of T2.
                if let Some(pos) = self.t2.iter().position(|x| x == dva) {
                    self.t2.remove(pos);
                    self.t2.push_back(*dva);
                }
            }

            Some(header.data.clone())
        } else {
            // Cache miss - a hit in a ghost list steers the adaptive
            // parameter toward the access pattern that would have kept
            // this block resident.
            if self.b1.contains(dva) || self.b2.contains(dva) {
                self.adapt_on_miss(dva);
            }
            self.misses += 1;
            None
        }
    }

    /// Add a block to the cache.
    ///
    /// New blocks are added to T1 (recency list). If the block is accessed
    /// again, it will be promoted to T2 (frequency list).
    ///
    /// # Arguments
    ///
    /// * `dva` - Device Virtual Address of the block
    /// * `data` - Block data to cache
    ///
    /// # Side Effects
    ///
    /// May evict other blocks to make room if the cache is full.
    /// Re-caching an already-resident DVA replaces its data in place.
    pub fn cache(&mut self, dva: Dva, data: Vec<u8>) {
        let size = data.len();

        // Evict entries until we have room. `evict_one` reports whether
        // it made progress; bail out once nothing is evictable (e.g. a
        // single block larger than `max_bytes`, or T1 and T2 already
        // empty) — the original unconditional loop spun forever here.
        while self.current_size + size > self.max_bytes {
            if !self.evict_one() {
                break;
            }
        }

        // A freshly cached block must not linger in a ghost list;
        // otherwise the same DVA could sit in B1 and T1 simultaneously
        // and later be pushed into a ghost list twice.
        if let Some(pos) = self.b1.iter().position(|x| x == &dva) {
            self.b1.remove(pos);
        }
        if let Some(pos) = self.b2.iter().position(|x| x == &dva) {
            self.b2.remove(pos);
        }

        let header = ArcHeader {
            data,
            access_count: 1,
            size,
            is_dedup: false,
            list_id: 1, // New entries start in T1
        };

        match self.index.insert(dva, header) {
            None => {
                self.current_size += size;
                self.t1.push_back(dva); // Add to MRU end
            }
            Some(old) => {
                // Replacing an existing entry: fix the byte accounting
                // (the original leaked `old.size` and never counted the
                // new size) and keep the entry's actual list membership
                // and access history instead of resetting them, since
                // the DVA is still queued in its old T1/T2 list.
                self.current_size = self.current_size.saturating_sub(old.size) + size;
                if let Some(h) = self.index.get_mut(&dva) {
                    h.list_id = old.list_id;
                    h.access_count = old.access_count + 1;
                }
            }
        }
    }

    /// Evict one entry from the cache.
    ///
    /// Eviction policy:
    /// 1. If T1 is larger than target (p) or B1 >= B2: evict from T1 → B1
    /// 2. Otherwise: evict from T2 → B2
    ///
    /// Evicts from the FRONT (LRU end) of the lists.
    ///
    /// Returns `true` if an entry was evicted, `false` if both resident
    /// lists were empty and no progress could be made (lets `cache` avoid
    /// an infinite loop).
    fn evict_one(&mut self) -> bool {
        let t1_size = self.t1.len();
        let b1_size = self.b1.len();
        let b2_size = self.b2.len();

        let to_evict = if t1_size > 0 && (t1_size > self.p || b1_size >= b2_size) {
            // Evict from T1 (recency) → B1.
            // CRITICAL: pop_front() evicts LRU (least recently used).
            self.t1.pop_front().map(|dva| {
                self.b1.push_back(dva); // Add to ghost list
                dva
            })
        } else {
            // Evict from T2 (frequency) → B2.
            // CRITICAL: pop_front() evicts LRU (least recently used).
            self.t2.pop_front().map(|dva| {
                self.b2.push_back(dva); // Add to ghost list
                dva
            })
        };

        let made_progress = to_evict.is_some();
        if let Some(dva) = to_evict {
            if let Some(header) = self.index.remove(&dva) {
                self.current_size -= header.size;
            }
        }

        // Trim ghost lists to prevent unbounded growth.
        // Maximum ghost entries = cache size / minimum block size (4 KiB).
        // Evict from FRONT (oldest ghosts); stop if both are drained.
        while self.b1.len() + self.b2.len() > self.max_bytes / 4096 {
            if self.b1.pop_front().is_none() && self.b2.pop_front().is_none() {
                break;
            }
        }

        made_progress
    }

    /// Adapt the cache parameter based on ghost list hits.
    ///
    /// When a cache miss occurs for a DVA that's in a ghost list,
    /// we adjust `p` to favor the corresponding access pattern:
    ///
    /// - Miss in B1: Increase `p` (favor recency)
    /// - Miss in B2: Decrease `p` (favor frequency)
    ///
    /// Per the ARC paper (Megiddo & Modha, FAST '03) the step is
    /// `max(1, ratio)`: the original bare integer division rounded the
    /// step down to 0 whenever the "other" ghost list was the smaller
    /// one, freezing the adaptation entirely.
    fn adapt_on_miss(&mut self, dva: &Dva) {
        let b1_size = self.b1.len();
        let b2_size = self.b2.len();
        // `p` counts entries; cap it at the number of minimum-size (4 KiB)
        // blocks that fit in the cache.
        let p_max = self.max_bytes / 4096;

        if self.b1.contains(dva) {
            // Ghost hit in B1: workload favors recency.
            // Increase p to give more space to T1.
            let delta = cmp::max(b2_size / cmp::max(b1_size, 1), 1);
            self.p = cmp::min(self.p.saturating_add(delta), p_max);
        } else if self.b2.contains(dva) {
            // Ghost hit in B2: workload favors frequency.
            // Decrease p to give more space to T2 (already floored at 0).
            let delta = cmp::max(b1_size / cmp::max(b2_size, 1), 1);
            self.p = self.p.saturating_sub(delta);
        }
    }

    /// Get cache statistics.
    ///
    /// # Returns
    ///
    /// Tuple of (hits, misses, hit_rate, current_size_bytes)
    pub fn stats(&self) -> (u64, u64, f64, usize) {
        let total = self.hits + self.misses;
        let hit_rate = if total > 0 {
            self.hits as f64 / total as f64
        } else {
            0.0
        };
        (self.hits, self.misses, hit_rate, self.current_size)
    }
}

impl Default for Arc {
    /// Equivalent to [`Arc::new`]: an empty cache capped at `ARC_MAX_BYTES`.
    fn default() -> Self {
        Self::new()
    }
}