noos 0.4.1

Reliability layer for Rust LLM agents: scope drift, cost circuit breaks, and procedural correction memory as event-driven Decisions.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
//! Synthetic task-eval harness (Tier 1.1 from docs/task-eval-design.md).
//!
//! Run: `cargo run --example task_eval_synthetic`
//!
//! Extends the single-category `agent_simulation.rs` to a MULTI-CATEGORY,
//! MULTI-SEED comparison that reports mean AND standard deviation — the
//! statistical bar set by CR5 step 1.
//!
//! ## What this harness tests
//!
//! Claim: Noos's cross-session LearnedState lets warm-start agents skip the
//! discovery phase that simple-retry agents must repeat for each category.
//!
//! Setup:
//! - 3 task categories, each with one correct strategy.
//! - A training session pre-exposes Noos to all categories → exported LearnedState.
//! - Eval: K-turn sequence with category order shuffled by seed.
//! - Baseline: simple quality-threshold retry, no Noos.
//! - Noos warm: imports training LearnedState, uses `signals.strategy`.
//! - ≥3 seeds. Report per-seed and aggregated mean ± stddev.
//!
//! ## What this harness is (and isn't)
//!
//! - IS: a skeleton demonstrating the fair-comparison shape. Runs quickly,
//!   no external deps, no candle.
//! - IS NOT: a validation of Noos. The task is synthetic and deterministic;
//!   the simulated LLM returns hand-authored qualities. A positive result
//!   here shows the harness works; a negative result shows Noos's
//!   cross-session story fails even in the favorable synthetic case.
//! - For real validation, Tier 2 benchmarks (LoCoMo, MetaMedQA) apply.
//!
//! ## Interpreting results
//!
//! See `docs/task-eval-design.md` §5 for what would count as "claim validated".
//! Headline: Noos warm must beat simple retry by ≥2 stddev on time-to-first-correct
//! to be anything more than noise on this synthetic task.

use noos::session::CognitiveSession;
use noos::types::world::{LearnedState, ResponseStrategy};
use std::collections::BTreeMap;

// ─── Task categories and strategies ───────────────────────────────────────

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
enum Category {
    NumericalDebug,
    QuickLookup,
    AmbiguousRequest,
}

impl Category {
    fn all() -> [Category; 3] {
        [
            Category::NumericalDebug,
            Category::QuickLookup,
            Category::AmbiguousRequest,
        ]
    }

    fn user_query(self) -> &'static str {
        match self {
            Category::NumericalDebug => "Help me debug this numerical issue.",
            Category::QuickLookup => "What is the default port for Postgres?",
            Category::AmbiguousRequest => "Make this better somehow.",
        }
    }

    fn correct_strategy(self) -> AppStrategy {
        match self {
            Category::NumericalDebug => AppStrategy::StepByStep,
            Category::QuickLookup => AppStrategy::DirectAnswer,
            Category::AmbiguousRequest => AppStrategy::AskClarifying,
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AppStrategy {
    DirectAnswer,
    AskClarifying,
    StepByStep,
}

impl AppStrategy {
    fn next(self) -> Self {
        match self {
            Self::DirectAnswer => Self::AskClarifying,
            Self::AskClarifying => Self::StepByStep,
            Self::StepByStep => Self::DirectAnswer,
        }
    }
}

/// Deterministic simulated LLM. Quality depends on (strategy, category) match.
fn simulate_llm(strategy: AppStrategy, category: Category) -> (String, f64) {
    let correct = category.correct_strategy();
    let quality = if strategy == correct {
        0.90 // right approach
    } else {
        match (strategy, correct) {
            // Near-misses yield partial quality.
            (AppStrategy::AskClarifying, _) => 0.30,
            (AppStrategy::StepByStep, _) => 0.25,
            (AppStrategy::DirectAnswer, _) => 0.15,
        }
    };
    // Response formats matter: Noos's `detect_response_strategy` uses regex
    // anchors that require specific shapes (multi-line numbered for StepByStep,
    // ≥2 question marks for ClarifyFirst). These responses are chosen to
    // unambiguously trigger the intended detection — a REAL app running against
    // real LLMs should prompt its model toward these formats when it wants
    // Noos to recognize the strategy correctly. See `detect_response_strategy`
    // in src/cognition/detector.rs for exact patterns.
    let text = match strategy {
        AppStrategy::DirectAnswer => "Short answer.".to_string(),
        AppStrategy::AskClarifying => {
            "What exactly isn't working? What error do you see?".to_string()
        }
        AppStrategy::StepByStep => "1. First, check the bounds.\n\
             2. Then, verify the index.\n\
             3. Next, trace a small example.\n\
             4. Finally, add assertions."
            .to_string(),
    };
    (text, quality)
}

fn strategy_cost(s: AppStrategy) -> f64 {
    match s {
        AppStrategy::DirectAnswer => 0.1,
        AppStrategy::AskClarifying => 0.3,
        AppStrategy::StepByStep => 0.8,
    }
}

// ─── Task sequence generation (deterministic from seed) ───────────────────

/// xorshift64 for reproducible, dependency-free sequence shuffling.
fn rand_u64(state: &mut u64) -> u64 {
    let mut x = *state;
    x ^= x << 13;
    x ^= x >> 7;
    x ^= x << 17;
    *state = x;
    x
}

/// Generate a shuffled K-turn sequence drawing from all categories roughly evenly.
fn generate_sequence(seed: u64, turns: usize) -> Vec<Category> {
    let cats = Category::all();
    let mut seq: Vec<Category> = (0..turns).map(|i| cats[i % cats.len()]).collect();
    // Fisher-Yates shuffle with seeded xorshift.
    let mut state = seed.wrapping_mul(0x9E3779B97F4A7C15).wrapping_add(1);
    for i in (1..seq.len()).rev() {
        let j = (rand_u64(&mut state) as usize) % (i + 1);
        seq.swap(i, j);
    }
    seq
}

// ─── Metrics ──────────────────────────────────────────────────────────────

#[derive(Debug, Default, Clone)]
struct EvalMetrics {
    avg_quality: f64,
    total_cost: f64,
    /// Per-category: turn index (within this run) of first correct choice. Missing = never.
    time_to_first_correct: BTreeMap<Category, Option<usize>>,
}

impl EvalMetrics {
    /// Mean turn-index of first correct strategy across categories. Lower is better.
    /// Categories that never saw the correct strategy count as `penalty` (typically 2*n_turns)
    /// so stddev stays finite while still ranking "never" worse than any real turn index.
    fn first_correct_summary(&self, penalty: f64) -> f64 {
        let vals: Vec<f64> = self
            .time_to_first_correct
            .values()
            .map(|o| o.map(|t| t as f64).unwrap_or(penalty))
            .collect();
        if vals.is_empty() {
            penalty
        } else {
            vals.iter().sum::<f64>() / vals.len() as f64
        }
    }
}

fn record_correct(
    metrics: &mut EvalMetrics,
    cat: Category,
    strategy: AppStrategy,
    turn_idx: usize,
) {
    let entry = metrics.time_to_first_correct.entry(cat).or_insert(None);
    if entry.is_none() && strategy == cat.correct_strategy() {
        *entry = Some(turn_idx);
    }
}

// ─── Baseline: simple-retry, no Noos ──────────────────────────────────────

const QUALITY_THRESHOLD_FOR_RETRY: f64 = 0.5;

/// Per-category strategy memory at application level (no Noos).
/// Models a plain app that remembers "what worked last time" per category,
/// which is a stronger baseline than a single strategy for all.
fn run_baseline(sequence: &[Category]) -> EvalMetrics {
    let mut metrics = EvalMetrics::default();
    let mut per_category: BTreeMap<Category, AppStrategy> = BTreeMap::new();
    let mut qualities = Vec::new();

    for (turn_idx, &category) in sequence.iter().enumerate() {
        // Start with app-memory choice, else DirectAnswer. Rotate on prior low quality.
        let entry = per_category
            .entry(category)
            .or_insert(AppStrategy::DirectAnswer);
        let strategy = *entry;

        let (_resp, quality) = simulate_llm(strategy, category);
        qualities.push(quality);
        metrics.total_cost += strategy_cost(strategy);
        record_correct(&mut metrics, category, strategy, turn_idx);

        // App decides next attempt on this category: if quality was low, rotate.
        if quality < QUALITY_THRESHOLD_FOR_RETRY {
            *per_category.get_mut(&category).expect("entry inserted") = strategy.next();
        }
    }

    metrics.avg_quality = qualities.iter().sum::<f64>() / qualities.len() as f64;
    for cat in Category::all() {
        metrics.time_to_first_correct.entry(cat).or_insert(None);
    }
    metrics
}

// ─── Noos warm-start: imports LearnedState from training ─────────────────

fn train_prior_session() -> LearnedState {
    let mut session = CognitiveSession::new();
    // Train by seeing correct strategy for every category, multiple turns each.
    for _round in 0..6 {
        for cat in Category::all() {
            let _ = session.process_message(cat.user_query());
            let (resp, quality) = simulate_llm(cat.correct_strategy(), cat);
            session.track_cost(strategy_cost(cat.correct_strategy()));
            session.process_response(&resp, quality);
        }
    }
    session.export_learned()
}

fn map_rec(r: ResponseStrategy, fallback: AppStrategy) -> AppStrategy {
    match r {
        ResponseStrategy::StepByStep => AppStrategy::StepByStep,
        ResponseStrategy::ClarifyFirst => AppStrategy::AskClarifying,
        ResponseStrategy::DirectAnswer => AppStrategy::DirectAnswer,
        _ => fallback,
    }
}

fn run_nous_warm(sequence: &[Category], training: LearnedState) -> EvalMetrics {
    let mut session = CognitiveSession::with_learned(training, 64);
    let mut metrics = EvalMetrics::default();
    let mut qualities = Vec::new();
    let mut per_category_last_quality: BTreeMap<Category, f64> = BTreeMap::new();
    let mut per_category_choice: BTreeMap<Category, AppStrategy> = BTreeMap::new();

    for (turn_idx, &category) in sequence.iter().enumerate() {
        let turn = session.process_message(category.user_query());

        let current = per_category_choice
            .entry(category)
            .or_insert(AppStrategy::DirectAnswer);

        // Prefer Noos's learned recommendation; fall back to per-category retry.
        let strategy = if let Some(rec) = turn.signals.strategy {
            map_rec(rec, *current)
        } else if let Some(&q) = per_category_last_quality.get(&category) {
            if q < QUALITY_THRESHOLD_FOR_RETRY {
                current.next()
            } else {
                *current
            }
        } else {
            *current
        };
        *current = strategy;

        let (resp, quality) = simulate_llm(strategy, category);
        qualities.push(quality);
        metrics.total_cost += strategy_cost(strategy);
        per_category_last_quality.insert(category, quality);
        record_correct(&mut metrics, category, strategy, turn_idx);

        // Close the allostatic loop so warm session keeps refining.
        session.track_cost(strategy_cost(strategy));
        session.process_response(&resp, quality);
    }

    metrics.avg_quality = qualities.iter().sum::<f64>() / qualities.len() as f64;
    for cat in Category::all() {
        metrics.time_to_first_correct.entry(cat).or_insert(None);
    }
    metrics
}

// ─── Aggregation & reporting (mean + stddev) ──────────────────────────────

#[derive(Debug)]
struct AggregatedMetrics {
    avg_quality_mean: f64,
    avg_quality_std: f64,
    total_cost_mean: f64,
    total_cost_std: f64,
    first_correct_mean: f64,
    first_correct_std: f64,
}

fn aggregate(runs: &[EvalMetrics], penalty: f64) -> AggregatedMetrics {
    let n = runs.len() as f64;
    let mean_of = |f: &dyn Fn(&EvalMetrics) -> f64| runs.iter().map(f).sum::<f64>() / n;
    let std_of = |mean: f64, f: &dyn Fn(&EvalMetrics) -> f64| {
        let var = runs
            .iter()
            .map(|r| {
                let d = f(r) - mean;
                d * d
            })
            .sum::<f64>()
            / n.max(1.0);
        var.sqrt()
    };
    let q: &dyn Fn(&EvalMetrics) -> f64 = &|r| r.avg_quality;
    let c: &dyn Fn(&EvalMetrics) -> f64 = &|r| r.total_cost;
    let f: &dyn Fn(&EvalMetrics) -> f64 = &|r| r.first_correct_summary(penalty);
    let quality_mean = mean_of(q);
    let cost_mean = mean_of(c);
    let first_mean = mean_of(f);
    AggregatedMetrics {
        avg_quality_mean: quality_mean,
        avg_quality_std: std_of(quality_mean, q),
        total_cost_mean: cost_mean,
        total_cost_std: std_of(cost_mean, c),
        first_correct_mean: first_mean,
        first_correct_std: std_of(first_mean, f),
    }
}

fn print_aggregated(label: &str, a: &AggregatedMetrics) {
    println!(
        "  {:<28} quality={:.3}±{:.3}   cost={:.2}±{:.2}   first_correct={:.2}±{:.2}",
        label,
        a.avg_quality_mean,
        a.avg_quality_std,
        a.total_cost_mean,
        a.total_cost_std,
        a.first_correct_mean,
        a.first_correct_std
    );
}

// ─── Main ─────────────────────────────────────────────────────────────────

fn main() {
    println!("╔══════════════════════════════════════════════════════════════╗");
    println!("║  task_eval_synthetic — multi-category, multi-seed harness   ║");
    println!("╚══════════════════════════════════════════════════════════════╝\n");
    println!("Tier 1.1 from docs/task-eval-design.md.");
    println!("Synthetic task — skeleton, not a validation. See doc §2 bar.\n");

    let turns = 12; // 4 per category in expectation
    let seeds: &[u64] = &[42, 7, 1337]; // ≥3 seeds (CR5 step 1)
    // Penalty for "never saw correct strategy this run": 2× turns, clearly worse
    // than any real turn index while keeping stddev finite.
    let never_penalty: f64 = (turns * 2) as f64;

    let training = train_prior_session();
    println!(
        "Training exported LearnedState: {} strategy entries, tick={}",
        training.response_strategies.len(),
        training.tick
    );
    println!("Eval: {} turns per run × {} seeds\n", turns, seeds.len());

    let mut baseline_runs = Vec::new();
    let mut nous_runs = Vec::new();

    println!("Per-seed results:");
    println!(
        "  {:<8} {:<28} {:>8} {:>8} {:>14}",
        "seed", "condition", "quality", "cost", "first_correct"
    );
    println!("  {}", "".repeat(72));

    for &seed in seeds {
        let sequence = generate_sequence(seed, turns);

        let baseline = run_baseline(&sequence);
        let noos = run_nous_warm(&sequence, training.clone());

        println!(
            "  {:<8} {:<28} {:>8.3} {:>8.2} {:>14.2}",
            seed,
            "baseline (simple retry)",
            baseline.avg_quality,
            baseline.total_cost,
            baseline.first_correct_summary(never_penalty)
        );
        println!(
            "  {:<8} {:<28} {:>8.3} {:>8.2} {:>14.2}",
            seed,
            "noos warm",
            noos.avg_quality,
            noos.total_cost,
            noos.first_correct_summary(never_penalty)
        );

        baseline_runs.push(baseline);
        nous_runs.push(noos);
    }

    let baseline_agg = aggregate(&baseline_runs, never_penalty);
    let nous_agg = aggregate(&nous_runs, never_penalty);

    println!("\nAggregated (mean ± stddev):");
    print_aggregated("baseline (simple retry)", &baseline_agg);
    print_aggregated("noos warm", &nous_agg);

    // Report comparison with the 2-stddev bar from task-eval-design.md §5.
    println!("\n2-stddev bar check (Noos must exceed baseline by ≥2 baseline stddev):");
    let quality_delta = nous_agg.avg_quality_mean - baseline_agg.avg_quality_mean;
    let first_correct_delta = baseline_agg.first_correct_mean - nous_agg.first_correct_mean;

    let quality_two_std = 2.0 * baseline_agg.avg_quality_std;
    let first_correct_two_std = 2.0 * baseline_agg.first_correct_std;

    println!(
        "  avg_quality:   delta = {:+.3}   threshold (2σ of baseline) = {:.3}  -> {}",
        quality_delta,
        quality_two_std,
        if quality_delta > quality_two_std {
            "PASS"
        } else {
            "NOT VALIDATED"
        }
    );
    println!(
        "  first_correct: delta = {:+.2}   threshold (2σ of baseline) = {:.2}  -> {}",
        first_correct_delta,
        first_correct_two_std,
        if first_correct_delta > first_correct_two_std {
            "PASS"
        } else {
            "NOT VALIDATED"
        }
    );

    println!("\nNotes:");
    println!("  - This is a synthetic task. PASS here = harness behaves, not 'Noos works'.");
    println!("  - 3 seeds is the MINIMUM per CR5. Real claims want more.");
    println!("  - Tier 2 benchmarks (LoCoMo, MetaMedQA) are what publishable claims require.");
    println!("  - If Noos FAILS on this synthetic favorable case, the cross-session");
    println!("    claim is not ready — don't ship the claim.");
}