Skip to main content

plato_lab_guard/
lib.rs

1//! plato-lab-guard — Unfakeable constraint lab with Achievement Loss scoring
2
3use std::collections::HashMap;
4
5// ── Hypothesis ───────────────────────────────────────────
6
/// A claim submitted to the lab together with the bookkeeping attached to it.
#[derive(Clone, Debug)]
pub struct Hypothesis {
    /// Caller-chosen identifier.
    pub id: String,
    /// The statement under test.
    pub claim: String,
    /// Conditions attached to the claim; empty until set.
    pub conditions: Vec<String>,
    /// Maximum acceptable Achievement Loss.
    pub threshold: f32,
    /// Name of the submitter; empty until set.
    pub submitted_by: String,
    /// Current lifecycle state.
    pub status: HypothesisStatus,
    /// Reasons recorded for failed gate checks; empty when none were recorded.
    pub gate_violations: Vec<String>,
}

/// Lifecycle states a [`Hypothesis`] can be in.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum HypothesisStatus {
    /// Initial state assigned by [`Hypothesis::new`].
    Pending,
    /// Gate checks were passed.
    Gated,
    /// An evaluation is in progress.
    Testing,
    /// Evaluation outcome: the claim held.
    Confirmed,
    /// Evaluation outcome: the claim was refuted.
    Falsified,
    /// Evaluation outcome: neither confirmed nor refuted.
    Inconclusive,
}

impl Hypothesis {
    /// Builds a `Pending` hypothesis with no conditions, no submitter and no
    /// gate violations.
    pub fn new(id: &str, claim: &str, threshold: f32) -> Self {
        Hypothesis {
            id: String::from(id),
            claim: String::from(claim),
            threshold,
            status: HypothesisStatus::Pending,
            conditions: vec![],
            submitted_by: String::default(),
            gate_violations: vec![],
        }
    }

    /// Returns the hypothesis with its conditions replaced by `conditions`.
    pub fn with_conditions(self, conditions: Vec<String>) -> Self {
        Self { conditions, ..self }
    }

    /// Returns the hypothesis with `name` recorded as the submitter.
    pub fn with_submitter(self, name: &str) -> Self {
        Self {
            submitted_by: name.to_owned(),
            ..self
        }
    }
}
51
52// ── Experiment Result ────────────────────────────────────
53
/// Measured outcome of one experiment run against a hypothesis.
#[derive(Debug, Clone)]
pub struct ExperimentResult {
    /// Identifier of the hypothesis this result belongs to.
    pub hypothesis_id: String,
    /// Comprehension score, expected in 0.0-1.0.
    pub comprehension: f32,
    /// Generalization score, expected in 0.0-1.0.
    pub generalization: f32,
    /// Retention score, expected in 0.0-1.0.
    pub retention: f32,
    /// The cherry-pickable metric; not part of the loss.
    pub raw_accuracy: f32,
    /// Free-form notes about the run.
    pub details: String,
}

impl ExperimentResult {
    /// Calculates Achievement Loss: `1 - comprehension * generalization * retention`.
    ///
    /// Each factor is forced into `[0.0, 1.0]` before multiplying, and a NaN
    /// factor counts as `0.0`. Without this, a single factor above 1.0 (or a
    /// pair of negative factors) would drive the loss to zero or below and
    /// pass any threshold. The returned loss is therefore always within
    /// `[0.0, 1.0]`; in-range inputs give exactly the unclamped formula.
    pub fn achievement_loss(&self) -> f32 {
        // NaN is handled explicitly: `clamp` would propagate it into the loss.
        let unit = |score: f32| {
            if score.is_nan() {
                0.0
            } else {
                score.clamp(0.0, 1.0)
            }
        };
        let product =
            unit(self.comprehension) * unit(self.generalization) * unit(self.retention);
        1.0 - product
    }
}
71
72// ── Verdict ──────────────────────────────────────────────
73
74#[derive(Debug, Clone)]
75pub struct Verdict {
76    pub hypothesis_id: String,
77    pub status: HypothesisStatus,
78    pub achievement_loss: f32,
79    pub threshold: f32,
80    pub passed: bool,
81    pub raw_accuracy: f32,
82    pub warning: String,
83    pub details: String,
84}
85
86// ── Gate Check ───────────────────────────────────────────
87
/// Outcome of a single gate check or of a whole submission.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GateResult {
    /// The check succeeded.
    Pass,
    /// The check failed; the payload is the human-readable reason.
    Fail(String),
}
93
94// ── Lab Guard ────────────────────────────────────────────
95
96pub struct LabGuard {
97    hypotheses: HashMap<String, Hypothesis>,
98    results: HashMap<String, ExperimentResult>,
99    verdicts: Vec<Verdict>,
100    loss_threshold: f32,
101}
102
103impl LabGuard {
104    pub fn new() -> Self {
105        Self {
106            hypotheses: HashMap::new(),
107            results: HashMap::new(),
108            verdicts: Vec::new(),
109            loss_threshold: 0.4,
110        }
111    }
112
113    pub fn with_loss_threshold(mut self, threshold: f32) -> Self {
114        self.loss_threshold = threshold;
115        self
116    }
117
118    // ── Gates ──
119
120    fn check_gates(&self, hyp: &Hypothesis) -> Vec<GateResult> {
121        let mut gates = Vec::new();
122
123        // Gate 1: Well-formed — must have claim, conditions, positive threshold
124        if hyp.claim.is_empty() {
125            gates.push(GateResult::Fail("Missing claim".to_string()));
126        } else if hyp.conditions.is_empty() {
127            gates.push(GateResult::Fail("Missing conditions".to_string()));
128        } else if hyp.threshold <= 0.0 || hyp.threshold >= 1.0 {
129            gates.push(GateResult::Fail("Threshold must be 0.0 < t < 1.0".to_string()));
130        } else {
131            gates.push(GateResult::Pass);
132        }
133
134        // Gate 2: Falsifiable — no absolute words
135        // Extended from ct-lab: JC1's lab rejected "DCS always improves fitness"
136        // because absolute quantifiers make claims unfalsifiable.
137        let absolutes = [
138            "always", "never", "all", "none", "every single",
139            "impossible to fail", "guaranteed", "proven", "unquestionably",
140            "without exception", "invariably", "universally",
141        ];
142        let claim_lower = hyp.claim.to_lowercase();
143        // Check for whole-word matches only ("overall" should not trigger "all")
144        let has_absolute = absolutes.iter().any(|a| {
145            let pat = format!(" {} ", a); // surrounded by spaces
146            claim_lower.contains(&pat) ||
147            claim_lower.starts_with(&format!("{} ", a)) ||
148            claim_lower.ends_with(&format!(" {}", a)) ||
149            claim_lower.as_str() == *a
150        });
151        if has_absolute {
152            gates.push(GateResult::Fail(format!(
153                "Absolute quantifier detected — claims must be falsifiable under specific conditions. Check for: {:?}",
154                absolutes
155            )));
156        } else {
157            gates.push(GateResult::Pass);
158        }
159
160        // Gate 2b: Vague causation — claims about proportionality must specify the mechanism
161        // Extended from ct-lab: "DCS benefit is inversely proportional to perception range"
162        // was too vague — needs the mechanism of action.
163        let vague_patterns = [
164            "inversely proportional", "directly proportional",
165            "proportional to", "correlated with", "depends on",
166        ];
167        let has_vague = vague_patterns.iter().any(|p| claim_lower.contains(p));
168        // Allow if claim also has specific conditions (numbers, mechanisms)
169        let has_specifics = hyp.conditions.iter().any(|c| {
170            c.contains(|ch: char| ch.is_ascii_digit()) ||
171            c.contains("because") || c.contains("due to") || c.contains("via") ||
172            c.contains("mechanism") || c.contains("threshold")
173        });
174        if has_vague && !has_specifics {
175            gates.push(GateResult::Fail(
176                "Vague causation — claims about proportionality must specify the mechanism. \n
177                 Add conditions with numbers or causal explanation (because, due to, via, mechanism).".to_string()
178            ));
179        } else {
180            gates.push(GateResult::Pass);
181        }
182
183        // Gate 3: Novel — not already submitted or tested
184        if self.hypotheses.contains_key(&hyp.id) || self.verdicts.iter().any(|v| v.hypothesis_id == hyp.id) {
185            gates.push(GateResult::Fail("Already tested".to_string()));
186        } else {
187            gates.push(GateResult::Pass);
188        }
189
190        // Gate 4: Bounded — threshold is a positive number
191        if hyp.threshold <= 0.0 {
192            gates.push(GateResult::Fail("Threshold must be positive".to_string()));
193        } else {
194            gates.push(GateResult::Pass);
195        }
196
197        gates
198    }
199
200    // ── Operations ──
201
202    pub fn submit(&mut self, mut hypothesis: Hypothesis) -> GateResult {
203        let gates = self.check_gates(&hypothesis);
204        let failures: Vec<String> = gates.iter()
205            .filter_map(|g| if let GateResult::Fail(r) = g { Some(r.clone()) } else { None })
206            .collect();
207
208        if failures.is_empty() {
209            hypothesis.status = HypothesisStatus::Gated;
210            hypothesis.gate_violations = Vec::new();
211            self.hypotheses.insert(hypothesis.id.clone(), hypothesis);
212            GateResult::Pass
213        } else {
214            hypothesis.gate_violations = failures.clone();
215            hypothesis.status = HypothesisStatus::Pending;
216            self.hypotheses.insert(hypothesis.id.clone(), hypothesis);
217            GateResult::Fail(failures.join("; "))
218        }
219    }
220
221    pub fn evaluate(&mut self, result: &ExperimentResult) -> Option<Verdict> {
222        let hyp = self.hypotheses.get_mut(&result.hypothesis_id)?;
223        hyp.status = HypothesisStatus::Testing;
224
225        let loss = result.achievement_loss();
226        let passed = loss <= hyp.threshold;
227
228        // Detect cherry-picking: high raw_accuracy but high loss
229        let cherry_pick_warning = if result.raw_accuracy > 0.95 && loss > 0.3 {
230            format!("CHERRY-PICK WARNING: raw_accuracy {:.2} but loss {:.2} — results may be cherry-picked",
231                result.raw_accuracy, loss)
232        } else {
233            String::new()
234        };
235
236        let status = if passed {
237            HypothesisStatus::Confirmed
238        } else if loss > self.loss_threshold {
239            HypothesisStatus::Falsified
240        } else {
241            HypothesisStatus::Inconclusive
242        };
243
244        hyp.status = status;
245        self.results.insert(result.hypothesis_id.clone(), result.clone());
246
247        let verdict = Verdict {
248            hypothesis_id: result.hypothesis_id.clone(),
249            status,
250            achievement_loss: loss,
251            threshold: hyp.threshold,
252            passed,
253            raw_accuracy: result.raw_accuracy,
254            warning: cherry_pick_warning,
255            details: format!("loss={:.4} threshold={:.4} comp={:.2} gen={:.2} ret={:.2}",
256                loss, hyp.threshold, result.comprehension, result.generalization, result.retention),
257        };
258
259        self.verdicts.push(verdict.clone());
260        Some(verdict)
261    }
262
263    // ── Queries ──
264
265    pub fn hypothesis(&self, id: &str) -> Option<&Hypothesis> {
266        self.hypotheses.get(id)
267    }
268
269    pub fn result(&self, id: &str) -> Option<&ExperimentResult> {
270        self.results.get(id)
271    }
272
273    pub fn verdict(&self, id: &str) -> Option<&Verdict> {
274        self.verdicts.iter().find(|v| v.hypothesis_id == id)
275    }
276
277    pub fn confirmed_count(&self) -> usize {
278        self.verdicts.iter().filter(|v| v.status == HypothesisStatus::Confirmed).count()
279    }
280
281    pub fn falsified_count(&self) -> usize {
282        self.verdicts.iter().filter(|v| v.status == HypothesisStatus::Falsified).count()
283    }
284
285    pub fn total_evaluated(&self) -> usize {
286        self.verdicts.len()
287    }
288
289    pub fn average_loss(&self) -> f32 {
290        if self.verdicts.is_empty() { return 0.0; }
291        let sum: f32 = self.verdicts.iter().map(|v| v.achievement_loss).sum();
292        sum / self.verdicts.len() as f32
293    }
294
295    /// Get hypotheses by status
296    pub fn by_status(&self, status: HypothesisStatus) -> Vec<&Hypothesis> {
297        self.hypotheses.values().filter(|h| h.status == status).collect()
298    }
299}
300
301impl Default for LabGuard {
302    fn default() -> Self {
303        Self::new()
304    }
305}
306
307// ── Tests ────────────────────────────────────────────────
308
#[cfg(test)]
mod tests {
    use super::*;

    /// A hypothesis that is expected to pass every gate.
    fn valid_hypothesis() -> Hypothesis {
        Hypothesis::new("hyp-1", "Snapping to Pythagorean coordinates reduces drift below 0.001", 0.3)
            .with_conditions(vec!["CUDA environment".to_string(), "10K iterations".to_string()])
            .with_submitter("Forgemaster")
    }

    /// A lab that already holds `valid_hypothesis()`; the setup itself is
    /// asserted so a gate regression fails here rather than somewhere later.
    fn guard_with_valid_hypothesis() -> LabGuard {
        let mut guard = LabGuard::new();
        assert_eq!(guard.submit(valid_hypothesis()), GateResult::Pass);
        guard
    }

    /// An experiment result for `hypothesis_id` with the given scores.
    fn experiment_for(
        hypothesis_id: &str,
        comprehension: f32,
        generalization: f32,
        retention: f32,
        raw_accuracy: f32,
    ) -> ExperimentResult {
        ExperimentResult {
            hypothesis_id: hypothesis_id.to_string(),
            comprehension,
            generalization,
            retention,
            raw_accuracy,
            details: String::new(),
        }
    }

    /// An experiment result for "hyp-1" with the given scores.
    fn experiment(comprehension: f32, generalization: f32, retention: f32, raw_accuracy: f32) -> ExperimentResult {
        experiment_for("hyp-1", comprehension, generalization, retention, raw_accuracy)
    }

    #[test]
    fn test_submit_valid() {
        let mut guard = LabGuard::new();
        let result = guard.submit(valid_hypothesis());
        assert!(matches!(result, GateResult::Pass));
        assert_eq!(guard.hypothesis("hyp-1").unwrap().status, HypothesisStatus::Gated);
    }

    #[test]
    fn test_gate_missing_claim() {
        let mut guard = LabGuard::new();
        let mut hyp = Hypothesis::new("hyp-x", "", 0.3);
        hyp.conditions = vec!["test".to_string()];
        let result = guard.submit(hyp);
        assert!(matches!(result, GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_missing_conditions() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("hyp-x", "Some claim", 0.3);
        let result = guard.submit(hyp);
        assert!(matches!(result, GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_absolute_words() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("hyp-x", "This always produces zero drift", 0.3)
            .with_conditions(vec!["test".to_string()]);
        let result = guard.submit(hyp);
        assert!(matches!(result, GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_invalid_threshold() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("hyp-x", "Some claim", 0.0)
            .with_conditions(vec!["test".to_string()]);
        let result = guard.submit(hyp);
        assert!(matches!(result, GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_novelty() {
        let mut guard = guard_with_valid_hypothesis();
        // Submit same ID again
        let hyp2 = Hypothesis::new("hyp-1", "Different claim about drift", 0.3)
            .with_conditions(vec!["test".to_string()]);
        let result = guard.submit(hyp2);
        assert!(matches!(result, GateResult::Fail(_)));
    }

    #[test]
    fn test_evaluate_unknown_hypothesis() {
        let mut guard = guard_with_valid_hypothesis();
        let result = experiment_for("no-such-id", 0.9, 0.9, 0.9, 0.9);
        assert!(guard.evaluate(&result).is_none());
        assert_eq!(guard.total_evaluated(), 0);
    }

    #[test]
    fn test_evaluate_confirmed() {
        let mut guard = guard_with_valid_hypothesis();

        let verdict = guard.evaluate(&experiment(0.95, 0.90, 0.88, 0.99)).unwrap();
        assert_eq!(verdict.status, HypothesisStatus::Confirmed);
        // loss = 1 - (0.95 * 0.90 * 0.88) = 1 - 0.7524 = 0.2476
        assert!(verdict.achievement_loss < 0.3);
        assert!(verdict.passed);
    }

    #[test]
    fn test_evaluate_falsified() {
        let mut guard = guard_with_valid_hypothesis();

        let verdict = guard.evaluate(&experiment(0.3, 0.2, 0.1, 0.95)).unwrap();
        assert_eq!(verdict.status, HypothesisStatus::Falsified);
        assert!(!verdict.passed);
    }

    #[test]
    fn test_evaluate_falsified_above_lab_threshold() {
        // Hypothesis threshold 0.5, lab threshold 0.2: a loss above both is Falsified.
        let mut guard = LabGuard::new().with_loss_threshold(0.2);
        let submitted = guard.submit(
            Hypothesis::new("hyp-1", "Some claim", 0.5).with_conditions(vec!["test".to_string()]),
        );
        assert_eq!(submitted, GateResult::Pass);

        // loss = 1 - (0.7 * 0.6 * 0.5) = 1 - 0.21 = 0.79
        let verdict = guard.evaluate(&experiment(0.7, 0.6, 0.5, 0.8)).unwrap();
        assert_eq!(verdict.status, HypothesisStatus::Falsified);
    }

    #[test]
    fn test_evaluate_inconclusive() {
        // Hypothesis threshold 0.5 < loss 0.79 <= lab threshold 0.9:
        // the claim did not pass, but the loss is not high enough to falsify it.
        let mut guard = LabGuard::new().with_loss_threshold(0.9);
        let submitted = guard.submit(
            Hypothesis::new("hyp-1", "Some claim", 0.5).with_conditions(vec!["test".to_string()]),
        );
        assert_eq!(submitted, GateResult::Pass);

        // loss = 1 - (0.7 * 0.6 * 0.5) = 1 - 0.21 = 0.79
        let verdict = guard.evaluate(&experiment(0.7, 0.6, 0.5, 0.8)).unwrap();
        assert_eq!(verdict.status, HypothesisStatus::Inconclusive);
        assert!(!verdict.passed);
        assert_eq!(guard.hypothesis("hyp-1").unwrap().status, HypothesisStatus::Inconclusive);
    }

    #[test]
    fn test_cherry_pick_warning() {
        let mut guard = guard_with_valid_hypothesis();

        // raw_accuracy 0.99 is cherry-picked: loss = 1 - 0.036 = 0.964
        let verdict = guard.evaluate(&experiment(0.4, 0.3, 0.3, 0.99)).unwrap();
        assert!(!verdict.warning.is_empty());
        assert!(verdict.warning.contains("CHERRY-PICK"));
    }

    #[test]
    fn test_no_cherry_pick_warning() {
        let mut guard = guard_with_valid_hypothesis();

        let verdict = guard.evaluate(&experiment(0.9, 0.88, 0.85, 0.92)).unwrap();
        assert!(verdict.warning.is_empty());
    }

    #[test]
    fn test_achievement_loss_formula() {
        let result = experiment_for("test", 0.5, 0.5, 0.5, 0.9);
        // loss = 1 - (0.5 * 0.5 * 0.5) = 1 - 0.125 = 0.875
        assert!((result.achievement_loss() - 0.875).abs() < 0.001);
    }

    #[test]
    fn test_stats() {
        let mut guard = guard_with_valid_hypothesis();

        assert_eq!(guard.confirmed_count(), 0);
        assert_eq!(guard.falsified_count(), 0);
        assert_eq!(guard.total_evaluated(), 0);

        assert!(guard.evaluate(&experiment(0.95, 0.90, 0.88, 0.99)).is_some());

        assert_eq!(guard.confirmed_count(), 1);
        assert_eq!(guard.total_evaluated(), 1);
        assert!(guard.average_loss() > 0.0);
    }

    #[test]
    fn test_by_status() {
        let guard = guard_with_valid_hypothesis();
        let gated = guard.by_status(HypothesisStatus::Gated);
        assert_eq!(gated.len(), 1);
    }

    #[test]
    fn test_verdict_details() {
        let mut guard = guard_with_valid_hypothesis();

        let verdict = guard.evaluate(&experiment(0.95, 0.90, 0.88, 0.99)).unwrap();
        assert!(verdict.details.contains("loss="));
        assert!(verdict.details.contains("comp="));
    }

    #[test]
    fn test_multiple_hypotheses() {
        let mut guard = guard_with_valid_hypothesis();
        let second = guard.submit(
            Hypothesis::new("hyp-2", "Constraint tightening improves precision", 0.35)
                .with_conditions(vec!["test".to_string()]),
        );
        assert_eq!(second, GateResult::Pass);

        assert_eq!(guard.by_status(HypothesisStatus::Gated).len(), 2);
    }

    // --- ct-lab Extended Gate Tests ---

    #[test]
    fn test_gate_rejects_always() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("always-bad", "DCS always improves fitness", 0.3)
            .with_conditions(vec!["agents=256".to_string()]);
        let result = guard.submit(hyp);
        assert!(matches!(result, GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_rejects_never() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("never-bad", "Trust decay never causes cascading failure", 0.2)
            .with_conditions(vec!["test".to_string()]);
        assert!(matches!(guard.submit(hyp), GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_rejects_guaranteed() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("guaranteed-bad", "Ghost tiles are guaranteed to improve recall", 0.1)
            .with_conditions(vec!["test".to_string()]);
        assert!(matches!(guard.submit(hyp), GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_allows_falsifiable() {
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("falsifiable", "DCS improves fitness when specialist ratio exceeds 5x", 0.3)
            .with_conditions(vec!["agents=256, specialist_ratio=5.0".to_string()]);
        assert!(matches!(guard.submit(hyp), GateResult::Pass));
    }

    #[test]
    fn test_gate_allows_overall_not_absolute() {
        // "overall" contains "all" but shouldn't trigger absolute gate
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("overall-ok", "Overall system performance improves with tiling", 0.3)
            .with_conditions(vec!["test".to_string()]);
        assert!(matches!(guard.submit(hyp), GateResult::Pass));
    }

    #[test]
    fn test_gate_rejects_vague_causation() {
        // ct-lab rejected: "DCS benefit is inversely proportional to perception range"
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("vague", "DCS benefit is inversely proportional to perception range", 0.3)
            .with_conditions(vec!["tested with simulation".to_string()]); // no mechanism, no numbers
        assert!(matches!(guard.submit(hyp), GateResult::Fail(_)));
    }

    #[test]
    fn test_gate_allows_specific_causation() {
        // Same claim but with mechanism explanation
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("specific", "DCS benefit is inversely proportional to perception range", 0.3)
            .with_conditions(vec!["because specialists exploit local gradients that generalists miss".to_string()]);
        assert!(matches!(guard.submit(hyp), GateResult::Pass));
    }

    #[test]
    fn test_gate_allows_causation_with_numbers() {
        // Numbers count as specifics
        let mut guard = LabGuard::new();
        let hyp = Hypothesis::new("with-nums", "Fitness is proportional to food density", 0.4)
            .with_conditions(vec!["food_density > 0.5".to_string()]);
        assert!(matches!(guard.submit(hyp), GateResult::Pass));
    }
}