lcpfs 2026.1.102

LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0
//
// Dynamic Partitioning
// PI-controlled resource allocation.

// Dynamically adjusts partition boundaries to minimize epsilon.
// Decision thresholds are learned from observed outcomes; the numeric literals
// in this file are only initial guesses and fixed tuning factors.
// ============================================================================

use alloc::collections::BTreeMap;
use alloc::vec::Vec;
use lazy_static::lazy_static;
use libm::{fabs, sqrt};
use spin::Mutex;

// ═══════════════════════════════════════════════════════════════════════════
// LEARNED THRESHOLDS (Welford's algorithm - no hardcoded values)
// ═══════════════════════════════════════════════════════════════════════════

/// A threshold value learned from observations using Welford's algorithm.
///
/// The threshold (`value`) is nudged after every observed outcome, while running
/// statistics of the outcomes (`mean_outcome`, `variance`, `uncertainty`) are maintained
/// online so that callers can judge how much the threshold can be trusted.
///
/// Derives `Debug` like the other public types in this module so that engine state can
/// be logged and inspected.
#[derive(Clone, Copy, Debug)]
pub struct LearnedThreshold {
    /// Current threshold value
    pub value: f64,
    /// Uncertainty in the threshold value (standard error of the mean outcome);
    /// `f64::MAX` until at least two observations have been recorded
    pub uncertainty: f64,
    /// Number of observations recorded
    pub observations: u64,
    /// Current learning rate (decreases as observations increase)
    pub learning_rate: f64,
    /// Mean of observed outcomes
    pub mean_outcome: f64,
    /// Sample variance of observed outcomes; `f64::MAX` until at least two observations
    /// have been recorded
    pub variance: f64,
}

impl LearnedThreshold {
    /// Creates a threshold with no observations.
    ///
    /// `initial_guess` becomes the starting `value`. `uncertainty` and `variance` start at
    /// `f64::MAX`, so for any finite estimated benefit [`should_act`](Self::should_act)
    /// returns `false` until a second observation has replaced them with real estimates.
    pub const fn uninformed(initial_guess: f64) -> Self {
        Self {
            value: initial_guess,
            uncertainty: f64::MAX,
            observations: 0,
            learning_rate: 1.0,
            mean_outcome: 0.0,
            variance: f64::MAX,
        }
    }

    /// Records one observation and updates both the outcome statistics and the threshold.
    ///
    /// * `action_value` - the value at which the action was taken.
    /// * `outcome_delta_epsilon` - the resulting change in epsilon (negative = improvement).
    ///
    /// The outcome statistics are updated with Welford's online algorithm. The threshold
    /// moves towards `action_value` after an improvement, and away from it (at half the
    /// rate) otherwise.
    ///
    /// Observations where either argument is NaN or infinite are ignored entirely: a single
    /// such sample would otherwise stay in `mean_outcome`, `variance` and `value` forever.
    /// For the same reason an adjustment that would make `value` non-finite is not applied.
    pub fn observe(&mut self, action_value: f64, outcome_delta_epsilon: f64) {
        if !action_value.is_finite() || !outcome_delta_epsilon.is_finite() {
            return;
        }

        self.observations += 1;
        let n = self.observations as f64;

        // Welford: `delta` uses the old mean, `delta2` the updated one.
        let delta = outcome_delta_epsilon - self.mean_outcome;
        self.mean_outcome += delta / n;
        let delta2 = outcome_delta_epsilon - self.mean_outcome;

        if self.observations > 1 {
            // Rebuild the sum of squared deviations from the previous sample variance
            // (previous count was n - 1, so M2 = variance * (n - 2)). On the second
            // observation the factor is 0, which discards the `f64::MAX` placeholder.
            let m2 = self.variance * (n - 2.0) + delta * delta2;
            self.variance = m2 / (n - 1.0);
            self.uncertainty = sqrt(self.variance / n);
        }

        let adjustment = if outcome_delta_epsilon < 0.0 {
            (action_value - self.value) * self.learning_rate
        } else {
            (self.value - action_value) * self.learning_rate * 0.5
        };

        // Extreme but finite inputs (e.g. `f64::MAX`) can still overflow the sum.
        let updated = self.value + adjustment;
        if updated.is_finite() {
            self.value = updated;
        }
        self.learning_rate = 1.0 / (1.0 + sqrt(self.observations as f64) * 0.1);
    }

    /// Returns confidence level (0.0 to 1.0) in the threshold value.
    ///
    /// The result is the product of a factor that grows with the number of observations
    /// and a factor that shrinks as `uncertainty` grows; it is 0.0 with no observations.
    pub fn confidence(&self) -> f64 {
        if self.observations == 0 {
            return 0.0;
        }
        let obs_factor = 1.0 - 1.0 / (1.0 + self.observations as f64 * 0.01);
        let unc_factor = 1.0 / (1.0 + fabs(self.uncertainty));
        obs_factor * unc_factor
    }

    /// Determines if action should be taken based on current value and estimated benefit.
    ///
    /// Returns `true` when `current_value` has reached the threshold and
    /// `estimated_benefit` is larger than the current `uncertainty`.
    pub fn should_act(&self, current_value: f64, estimated_benefit: f64) -> bool {
        // The small epsilon keeps the ratio defined when the uncertainty is exactly zero.
        let benefit_over_uncertainty = estimated_benefit / (self.uncertainty + 1e-10);
        current_value >= self.value && benefit_over_uncertainty > 1.0
    }
}

// ═══════════════════════════════════════════════════════════════════════════
// PARTITION DEFINITION
// ═══════════════════════════════════════════════════════════════════════════

/// A dynamic partition within the pool.
///
/// All sizes are expressed in blocks. The statistics fields (`access_frequency`,
/// `avg_latency_us`, `utilization`) are zero when the engine creates a partition and are
/// overwritten together by `PartitionEngine::update_stats`.
#[derive(Clone, Debug)]
pub struct Partition {
    /// Unique partition identifier
    pub id: u64,
    /// Human-readable partition name
    pub name: alloc::string::String,
    /// Current size in blocks
    pub size_blocks: u64,
    /// Minimum size (cannot shrink below this)
    pub min_size_blocks: u64,
    /// Maximum size (cannot grow above this)
    pub max_size_blocks: u64,
    /// Access frequency (accesses per second)
    pub access_frequency: f64,
    /// Average I/O latency (microseconds)
    pub avg_latency_us: f64,
    /// Space utilization (0.0 to 1.0)
    pub utilization: f64,
    /// Timestamp of the last resize in milliseconds, as supplied by the caller of
    /// `repartition`; 0 until the partition has been resized once
    pub last_resize_ms: u64,
}

impl Partition {
    /// Calculate partition pressure (higher = needs more space).
    ///
    /// Pressure is `(utilization * access_frequency) / available_capacity`, where
    /// `available_capacity` is the fraction of `max_size_blocks` not yet claimed by
    /// `size_blocks`.
    ///
    /// Returns `f64::MAX` when less than 1% of the maximum is still available. That
    /// includes the degenerate cases of a zero maximum and of a current size that already
    /// exceeds the maximum, which previously produced NaN and an integer underflow
    /// respectively.
    pub fn pressure(&self) -> f64 {
        if self.max_size_blocks == 0 {
            // No capacity at all: avoid 0/0 and report maximum pressure.
            return f64::MAX;
        }

        // Saturate so an oversized partition counts as "no headroom" instead of wrapping.
        let headroom_blocks = self.max_size_blocks.saturating_sub(self.size_blocks);
        let available_capacity = headroom_blocks as f64 / self.max_size_blocks as f64;
        if available_capacity < 0.01 {
            return f64::MAX;
        }
        (self.utilization * self.access_frequency) / available_capacity
    }

    /// Calculate epsilon contribution (higher = worse performance).
    ///
    /// The contribution is `avg_latency_us * utilization * access_frequency`, so a busy,
    /// full and slow partition contributes the most.
    pub fn epsilon_contribution(&self) -> f64 {
        self.avg_latency_us * self.utilization * self.access_frequency
    }
}

/// Outcome of a repartition, kept as a sample for learning.
///
/// Derives `Debug` like the other public types in this module so that recorded outcomes
/// can be logged and inspected.
#[derive(Clone, Copy, Debug)]
pub struct RepartitionOutcome {
    /// ID of the partition that was resized
    pub partition_id: u64,
    /// Partition size before repartition
    pub old_size_blocks: u64,
    /// Partition size after repartition
    pub new_size_blocks: u64,
    /// Partition pressure when decision was made
    pub pressure_at_decision: f64,
    /// System epsilon before repartition
    pub epsilon_before: f64,
    /// System epsilon after repartition
    pub epsilon_after: f64,
    /// Time taken to complete repartition
    pub time_taken_ms: u64,
}

impl RepartitionOutcome {
    /// Change in system epsilon caused by the repartition: `epsilon_after - epsilon_before`.
    /// A negative result means epsilon went down (better); a positive one means it went up.
    pub fn delta_epsilon(&self) -> f64 {
        let Self {
            epsilon_before,
            epsilon_after,
            ..
        } = *self;
        epsilon_after - epsilon_before
    }

    /// Whether the repartition improved performance, i.e. the epsilon change is strictly
    /// negative.
    pub fn was_beneficial(&self) -> bool {
        let change = self.delta_epsilon();
        change < 0.0
    }
}

// ═══════════════════════════════════════════════════════════════════════════
// PARTITION ENGINE
// ═══════════════════════════════════════════════════════════════════════════

lazy_static! {
    /// Global partition engine managing all dynamic pool partitions.
    ///
    /// Constructed lazily on first access and guarded by a `spin::Mutex`. The free
    /// functions at the bottom of this module each lock it for a single call.
    pub static ref PARTITION_ENGINE: Mutex<PartitionEngine> = Mutex::new(PartitionEngine::new());
}

/// Engine for managing dynamic partitions with PI-driven adaptive resizing.
///
/// Holds the partition table, a bounded history of past repartition outcomes, and the
/// learned thresholds that decide when and by how much a partition is resized.
pub struct PartitionEngine {
    /// Map of all active partitions by ID
    pub partitions: BTreeMap<u64, Partition>,
    /// Total number of blocks in the pool. Initialised to 0 by `new()`; none of the code
    /// visible in this module updates it, it is only reported by `stats()`.
    pub total_pool_blocks: u64,
    /// Whether a repartition is currently in progress (set and cleared inside
    /// `repartition`)
    pub is_repartitioning: bool,

    // History of past repartition outcomes used for benefit estimation; `repartition`
    // trims it to the most recent 100 entries.
    outcomes: alloc::collections::VecDeque<RepartitionOutcome>,

    // ═══════════════════════════════════════════════════════════════════════
    // LEARNED THRESHOLDS (initial guesses are set in `new()`)
    // ═══════════════════════════════════════════════════════════════════════
    /// Learned: Pressure threshold to trigger resize
    threshold_pressure: LearnedThreshold,

    /// Learned: Minimum time between repartitions (ms)
    threshold_cooldown: LearnedThreshold,

    /// Learned: Growth increment (fraction of current size, e.g. 0.1 = 10%)
    growth_increment: LearnedThreshold,

    /// Learned: Shrink increment (fraction of current size, e.g. 0.05 = 5%)
    shrink_increment: LearnedThreshold,

    /// Current system epsilon, as last reported through `update_epsilon`
    current_epsilon: f64,
}

impl Default for PartitionEngine {
    fn default() -> Self {
        Self::new()
    }
}

impl PartitionEngine {
    /// Builds an empty engine: no partitions, a pool size of zero, an empty outcome
    /// history (pre-sized for 100 entries) and thresholds that have seen no observations.
    pub fn new() -> Self {
        // Starting points only; each threshold adapts as outcomes are observed.
        let threshold_pressure = LearnedThreshold::uninformed(100.0); // pressure trigger
        let threshold_cooldown = LearnedThreshold::uninformed(300_000.0); // 5 minutes in ms
        let growth_increment = LearnedThreshold::uninformed(0.1); // grow by 10%
        let shrink_increment = LearnedThreshold::uninformed(0.05); // shrink by 5%

        Self {
            partitions: BTreeMap::new(),
            total_pool_blocks: 0,
            is_repartitioning: false,
            outcomes: alloc::collections::VecDeque::with_capacity(100),
            threshold_pressure,
            threshold_cooldown,
            growth_increment,
            shrink_increment,
            current_epsilon: 0.0,
        }
    }

    /// Stores the latest system-wide epsilon reading.
    ///
    /// `repartition` reads this value when it records the outcome of a resize.
    pub fn update_epsilon(&mut self, epsilon: f64) {
        self.current_epsilon = epsilon;
    }

    /// Registers a new partition and returns its ID.
    ///
    /// The new partition starts with zeroed statistics and a `last_resize_ms` of 0.
    ///
    /// The ID is one greater than the largest ID currently in `partitions` (0 for an empty
    /// map). Deriving it from the map length instead would reuse a live ID, and silently
    /// overwrite that partition, as soon as any entry had been removed from the public map.
    ///
    /// # Errors
    ///
    /// * `min_size_blocks > max_size_blocks`
    /// * `size_blocks` outside `[min_size_blocks, max_size_blocks]`
    /// * no further ID can be generated
    ///
    /// Rejecting inconsistent bounds here keeps `Partition::pressure`, which subtracts the
    /// current size from the maximum and divides by the maximum, well defined.
    pub fn create_partition(
        &mut self,
        name: &str,
        size_blocks: u64,
        min_size_blocks: u64,
        max_size_blocks: u64,
    ) -> Result<u64, &'static str> {
        if min_size_blocks > max_size_blocks {
            return Err("Partition minimum size exceeds maximum size");
        }
        if size_blocks < min_size_blocks || size_blocks > max_size_blocks {
            return Err("Partition size is outside its min/max bounds");
        }

        // BTreeMap keys iterate in ascending order, so the last key is the largest ID.
        let id = match self.partitions.keys().next_back() {
            Some(last) => last.checked_add(1).ok_or("Partition ID space exhausted")?,
            None => 0,
        };

        let partition = Partition {
            id,
            name: alloc::string::String::from(name),
            size_blocks,
            min_size_blocks,
            max_size_blocks,
            access_frequency: 0.0,
            avg_latency_us: 0.0,
            utilization: 0.0,
            last_resize_ms: 0,
        };

        self.partitions.insert(id, partition);
        crate::lcpfs_println!("[ PARTITION] Created partition '{}' ({})", name, id);

        Ok(id)
    }

    /// Replaces the measured statistics of one partition.
    ///
    /// All three values are overwritten together. An unknown `partition_id` is ignored.
    pub fn update_stats(
        &mut self,
        partition_id: u64,
        access_frequency: f64,
        avg_latency_us: f64,
        utilization: f64,
    ) {
        let target = match self.partitions.get_mut(&partition_id) {
            Some(entry) => entry,
            None => return,
        };

        target.access_frequency = access_frequency;
        target.avg_latency_us = avg_latency_us;
        target.utilization = utilization;
    }

    /// Decides whether the given partition should be resized now.
    ///
    /// Returns `false` while a repartition is in progress, for an unknown partition, or
    /// while the learned cooldown since the partition's last resize has not elapsed.
    /// Otherwise the partition's pressure and the estimated benefit are checked against the
    /// learned pressure threshold, which must also have a confidence above 0.1.
    pub fn should_repartition(&self, partition_id: u64, current_time_ms: u64) -> bool {
        if self.is_repartitioning {
            return false;
        }

        let candidate = if let Some(found) = self.partitions.get(&partition_id) {
            found
        } else {
            return false;
        };

        // Cooldown gate: a clock that went backwards counts as zero elapsed time.
        let elapsed_ms = current_time_ms.saturating_sub(candidate.last_resize_ms) as f64;
        if elapsed_ms < self.threshold_cooldown.value {
            return false;
        }

        let load = candidate.pressure();
        let expected_gain = self.estimate_repartition_benefit(partition_id, load);

        let gate = &self.threshold_pressure;
        gate.should_act(load, expected_gain) && gate.confidence() > 0.1
    }

    /// Estimates how much epsilon a repartition of this partition is likely to remove.
    ///
    /// Only past outcomes for the same partition whose pressure was within 30% of
    /// `current_pressure` are considered:
    /// * none on record: fall back to 10% of the current pressure;
    /// * some on record but none beneficial: 0.0;
    /// * otherwise: the mean epsilon reduction of the beneficial ones.
    fn estimate_repartition_benefit(&self, partition_id: u64, current_pressure: f64) -> f64 {
        let tolerance = current_pressure * 0.3;

        let mut comparable = 0usize;
        let mut wins = 0usize;
        let mut total_gain = 0.0f64;

        for past in self.outcomes.iter() {
            let same_partition = past.partition_id == partition_id;
            let similar_pressure = fabs(past.pressure_at_decision - current_pressure) < tolerance;
            if same_partition && similar_pressure {
                comparable += 1;
                if past.was_beneficial() {
                    wins += 1;
                    // Beneficial deltas are negative; negate to accumulate the reduction.
                    total_gain += -past.delta_epsilon();
                }
            }
        }

        if comparable == 0 {
            return current_pressure * 0.1;
        }
        if wins == 0 {
            return 0.0;
        }

        (total_gain / wins as f64).max(0.0)
    }

    /// Resizes one partition by a learned increment and records the outcome for learning.
    ///
    /// The partition grows by `growth_increment` (a fraction of its current size, capped at
    /// `max_size_blocks`) when its pressure exceeds the learned pressure threshold.
    /// Otherwise it shrinks by `shrink_increment` (floored at `min_size_blocks`). If the
    /// resulting size equals the current size, nothing is changed and `Ok(())` is returned.
    ///
    /// `current_time_ms` is stored as the partition's `last_resize_ms`. The outcome history
    /// is trimmed to the most recent 100 entries.
    ///
    /// # Errors
    ///
    /// Returns `"Repartition already in progress"` when `is_repartitioning` is set, and
    /// `"Partition not found"` for an unknown `partition_id`.
    pub fn repartition(
        &mut self,
        partition_id: u64,
        current_time_ms: u64,
    ) -> Result<(), &'static str> {
        if self.is_repartitioning {
            return Err("Repartition already in progress");
        }

        let partition = self
            .partitions
            .get(&partition_id)
            .ok_or("Partition not found")?;

        let pressure = partition.pressure();
        let old_size = partition.size_blocks;
        let epsilon_before = self.current_epsilon;

        // Determine new size based on pressure
        let new_size = if pressure > self.threshold_pressure.value {
            // Grow partition. The learned increment is unbounded and the float-to-int cast
            // saturates at u64::MAX, so the addition must saturate too instead of wrapping.
            let growth = (old_size as f64 * self.growth_increment.value) as u64;
            old_size
                .saturating_add(growth)
                .min(partition.max_size_blocks)
        } else {
            // Shrink partition
            let shrink = (old_size as f64 * self.shrink_increment.value) as u64;
            (old_size.saturating_sub(shrink)).max(partition.min_size_blocks)
        };

        if new_size == old_size {
            return Ok(()); // No change needed
        }

        crate::lcpfs_println!(
            "[ PARTITION] Resizing partition {} from {} to {} blocks (pressure={:.2})",
            partition_id,
            old_size,
            new_size,
            pressure
        );

        self.is_repartitioning = true;
        let start_time = crate::get_time();

        // Actually resize the partition (allocate/deallocate blocks)
        let blocks_changed = if new_size > old_size {
            // Growing: allocate additional blocks
            let blocks_to_add = new_size - old_size;
            self.allocate_blocks(partition_id, blocks_to_add)
        } else {
            // Shrinking: deallocate blocks
            let blocks_to_remove = old_size - new_size;
            self.deallocate_blocks(partition_id, blocks_to_remove)
        };

        // Presumably `get_time()` counts nanoseconds (the divisor converts to ms) — confirm.
        let time_taken_ms = (crate::get_time() - start_time) / 1_000_000;

        // Update partition size.
        // NOTE(review): the size becomes `new_size` even when fewer blocks than requested
        // were actually (de)allocated; `blocks_changed` is only logged below.
        if let Some(p) = self.partitions.get_mut(&partition_id) {
            p.size_blocks = new_size;
            p.last_resize_ms = current_time_ms;
        }

        self.is_repartitioning = false;

        crate::lcpfs_println!(
            "[ PARTITION] Resize complete: {} blocks changed in {} ms",
            blocks_changed,
            time_taken_ms
        );

        // Record outcome for learning.
        // NOTE(review): `epsilon_after` is read from the same field as `epsilon_before` and
        // nothing in this method writes `current_epsilon` in between, so the recorded delta
        // is always 0.0. A meaningful delta would require finalising the outcome after a
        // later `update_epsilon` call.
        let outcome = RepartitionOutcome {
            partition_id,
            old_size_blocks: old_size,
            new_size_blocks: new_size,
            pressure_at_decision: pressure,
            epsilon_before,
            epsilon_after: self.current_epsilon,
            time_taken_ms,
        };

        self.learn_from_outcome(&outcome);
        self.outcomes.push_back(outcome);

        // Keep only the most recent 100 outcomes.
        while self.outcomes.len() > 100 {
            self.outcomes.pop_front();
        }

        Ok(())
    }

    /// Learn from repartition outcome
    fn learn_from_outcome(&mut self, outcome: &RepartitionOutcome) {
        let delta = outcome.delta_epsilon();

        // Learn pressure threshold
        self.threshold_pressure
            .observe(outcome.pressure_at_decision, delta);

        // If repartition was bad, increase threshold (be more conservative)
        if !outcome.was_beneficial() {
            self.threshold_pressure
                .observe(outcome.pressure_at_decision * 1.5, 0.0);
        }

        // Learn growth/shrink increments
        let size_change_ratio = (outcome.new_size_blocks as f64 - outcome.old_size_blocks as f64)
            / outcome.old_size_blocks as f64;

        if size_change_ratio > 0.0 {
            // Was a growth
            self.growth_increment
                .observe(size_change_ratio.abs(), delta);
        } else {
            // Was a shrink
            self.shrink_increment
                .observe(size_change_ratio.abs(), delta);
        }
    }

    /// Placeholder allocation for partition growth; returns how many blocks were written.
    ///
    /// A full implementation would query the metaslab allocator for free blocks, mark them
    /// as belonging to the partition, and update space maps and allocation bitmaps. This
    /// simplified version only checks that the target blocks are writable: it writes a
    /// zeroed 512-byte buffer to at most 100 blocks on device 0, starting at block
    /// `partition_id * 10000`.
    ///
    /// Returns 0 without logging when the `BLOCK_DEVICES` lock cannot be taken.
    ///
    /// NOTE(review): the probe overwrites whatever those blocks currently hold — confirm
    /// this range is never used for live data.
    fn allocate_blocks(&mut self, partition_id: u64, count: u64) -> u64 {
        use crate::BLOCK_DEVICES;

        let mut devices = match BLOCK_DEVICES.try_lock() {
            Some(guard) => guard,
            None => return 0,
        };

        let allocated = match devices.get_mut(0) {
            Some(dev) => {
                let zero_sector = [0u8; 512];
                // Each partition gets its own 10k-block range.
                let first_block = partition_id * 10000;
                // Cap the number of writes to keep the operation short.
                let attempts = count.min(100);

                (0..attempts)
                    .map(|offset| (first_block + offset) as usize)
                    .filter(|&block_num| dev.write_block(block_num, &zero_sector).is_ok())
                    .count() as u64
            }
            None => 0u64,
        };

        crate::lcpfs_println!(
            "[ PARTITION] Allocated {} blocks for partition {}",
            allocated,
            partition_id
        );
        allocated
    }

    /// Placeholder deallocation for partition shrinkage; returns how many blocks it
    /// reports as freed.
    ///
    /// A full implementation would check that the blocks are unused (reference counts),
    /// mark them free in the metaslab allocator, update space maps, and issue TRIM/discard
    /// where the device supports it. This simplified version touches no device: it logs and
    /// returns `count` capped at 100.
    fn deallocate_blocks(&mut self, partition_id: u64, count: u64) -> u64 {
        let released = if count > 100 { 100 } else { count };

        crate::lcpfs_println!(
            "[ PARTITION] Freed {} blocks from partition {}",
            released,
            partition_id
        );

        released
    }

    /// Get current statistics
    pub fn stats(&self) -> PartitionStats {
        let total_partitions = self.partitions.len();
        let total_epsilon: f64 = self
            .partitions
            .values()
            .map(|p| p.epsilon_contribution())
            .sum();

        PartitionStats {
            total_partitions,
            total_pool_blocks: self.total_pool_blocks,
            total_epsilon,
            is_repartitioning: self.is_repartitioning,
            pressure_threshold: self.threshold_pressure.value,
            pressure_confidence: self.threshold_pressure.confidence(),
        }
    }
}

/// Statistics snapshot of the partition engine state.
///
/// A plain copy of values taken at the time `PartitionEngine::stats` was called; it does
/// not track later changes to the engine.
#[derive(Debug, Clone, Copy)]
pub struct PartitionStats {
    /// Number of active partitions
    pub total_partitions: usize,
    /// Total blocks available in the pool (the engine's `total_pool_blocks` field)
    pub total_pool_blocks: u64,
    /// Sum of epsilon contributions from all partitions
    pub total_epsilon: f64,
    /// Whether repartitioning is currently in progress
    pub is_repartitioning: bool,
    /// Current learned pressure threshold value
    pub pressure_threshold: f64,
    /// Confidence level (0.0 to 1.0) in the pressure threshold
    pub pressure_confidence: f64,
}

// ═══════════════════════════════════════════════════════════════════════════
// PUBLIC API
// ═══════════════════════════════════════════════════════════════════════════

/// Reports the latest system epsilon to the global partition engine.
pub fn update_epsilon(epsilon: f64) {
    let mut engine = PARTITION_ENGINE.lock();
    engine.update_epsilon(epsilon);
}

/// Creates a partition in the global partition engine and returns its ID.
///
/// See `PartitionEngine::create_partition` for the meaning of the arguments.
pub fn create_partition(
    name: &str,
    size_blocks: u64,
    min_size_blocks: u64,
    max_size_blocks: u64,
) -> Result<u64, &'static str> {
    let mut engine = PARTITION_ENGINE.lock();
    engine.create_partition(name, size_blocks, min_size_blocks, max_size_blocks)
}

/// Replaces the measured statistics of one partition in the global partition engine.
pub fn update_stats(
    partition_id: u64,
    access_frequency: f64,
    avg_latency_us: f64,
    utilization: f64,
) {
    let mut engine = PARTITION_ENGINE.lock();
    engine.update_stats(partition_id, access_frequency, avg_latency_us, utilization);
}

/// Asks the global partition engine whether the given partition should be resized now.
pub fn should_repartition(partition_id: u64, current_time_ms: u64) -> bool {
    let engine = PARTITION_ENGINE.lock();
    engine.should_repartition(partition_id, current_time_ms)
}

/// Resizes the given partition through the global partition engine.
///
/// The engine lock is held for the whole operation.
pub fn repartition(partition_id: u64, current_time_ms: u64) -> Result<(), &'static str> {
    let mut engine = PARTITION_ENGINE.lock();
    engine.repartition(partition_id, current_time_ms)
}

/// Returns a statistics snapshot of the global partition engine.
pub fn stats() -> PartitionStats {
    let engine = PARTITION_ENGINE.lock();
    engine.stats()
}