use noos::session::CognitiveSession;
use noos::types::world::{LearnedState, ResponseStrategy};
use std::collections::BTreeMap;
/// Kind of user request that appears in the synthetic query stream.
///
/// The variant order feeds the derived `Ord`, which decides how `BTreeMap`s
/// keyed by `Category` are ordered, so it must not be rearranged casually.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
enum Category {
    Debug,
    Lookup,
    Clarify,
    Compose,
    Translate,
}

impl Category {
    /// Builds the user-facing query text for this category, embedding `idx`
    /// so that every query in a stream is distinct.
    fn user_query(self, idx: usize) -> String {
        match self {
            Category::Debug => format!("Help me debug this numerical issue {idx}."),
            Category::Lookup => format!("What is the default port for service {idx}?"),
            Category::Clarify => format!("Make system {idx} better somehow."),
            Category::Compose => format!("Compose a poem about topic {idx}."),
            Category::Translate => format!("Translate phrase {idx} into French."),
        }
    }

    /// Returns the strategy that the simulated LLM rewards for this category.
    fn correct_strategy(self) -> AppStrategy {
        match self {
            Category::Debug => AppStrategy::StepByStep,
            Category::Clarify => AppStrategy::AskClarifying,
            Category::Lookup | Category::Compose | Category::Translate => {
                AppStrategy::DirectAnswer
            }
        }
    }

    /// Reports whether the category belongs to the pre-trained set
    /// (`Debug`, `Lookup`, `Clarify`); the remaining ones are novel.
    fn is_pretrained(self) -> bool {
        match self {
            Category::Debug | Category::Lookup | Category::Clarify => true,
            Category::Compose | Category::Translate => false,
        }
    }
}
/// Response strategy the application can apply when it attempts an answer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AppStrategy {
/// Reply with a short, direct answer.
DirectAnswer,
/// Reply with clarifying questions instead of an answer.
AskClarifying,
/// Reply with a numbered step-by-step walkthrough.
StepByStep,
}
/// Deterministic stand-in for an LLM call.
///
/// Returns the canned response text for `strategy` together with a quality
/// score: `0.90` when the strategy is the correct one for a pre-trained
/// category, `0.40` when it is correct for a novel category, and `0.20`
/// whenever the strategy does not match the category's correct strategy.
fn simulate_llm(strategy: AppStrategy, category: Category) -> (String, f64) {
    let reply = match strategy {
        AppStrategy::StepByStep => {
            String::from("1. First, identify.\n2. Then, check.\n3. Finally, verify.")
        }
        AppStrategy::AskClarifying => {
            String::from("What exactly do you need? What have you tried?")
        }
        AppStrategy::DirectAnswer => format!("Short answer for {:?}.", category),
    };

    let strategy_matches = strategy == category.correct_strategy();
    let score = match (strategy_matches, category.is_pretrained()) {
        (false, _) => 0.20,
        (true, true) => 0.90,
        (true, false) => 0.40,
    };

    (reply, score)
}
/// Produces the fixed 30-query evaluation stream.
///
/// The stream is six repetitions of the five-category cycle
/// Debug → Compose → Lookup → Translate → Clarify; each entry pairs the
/// category with its query text, numbered by position in the stream.
fn generate_stream() -> Vec<(Category, String)> {
    const REPETITIONS: usize = 6;
    let cycle = [
        Category::Debug,
        Category::Compose,
        Category::Lookup,
        Category::Translate,
        Category::Clarify,
    ];

    cycle
        .iter()
        .copied()
        .cycle()
        .take(cycle.len() * REPETITIONS)
        .enumerate()
        .map(|(position, category)| (category, category.user_query(position)))
        .collect()
}
/// Runs a fresh session through six rounds of the three pre-trained
/// categories, always answering with the correct strategy, and returns the
/// state exported by that session.
fn train_prior_session() -> LearnedState {
    let pretrained = [Category::Debug, Category::Lookup, Category::Clarify];
    let mut session = CognitiveSession::new();

    for round in 0..6 {
        for cat in pretrained.iter().copied() {
            // The per-turn result is not needed during training.
            let prompt = cat.user_query(round);
            let _ = session.process_message(&prompt);

            let (reply, score) = simulate_llm(cat.correct_strategy(), cat);
            session.track_cost(0.5);
            session.process_response(&reply, score);
        }
    }

    session.export_learned()
}
/// What an agent chooses to do with a single query from the stream.
#[derive(Debug, Clone, Copy)]
enum Decision {
/// Attempt an answer using the given strategy.
Answer(AppStrategy),
/// Decline to answer the query.
Abstain,
}
/// Running tally of answer/abstain outcomes for one agent.
///
/// "Negative" here means *abstained*: a true negative is an abstention on a
/// novel query, a false negative is an abstention on a pre-trained one.
#[derive(Debug, Default, Clone)]
struct AbstentionStats {
    /// Answered a pre-trained query with the correct strategy.
    true_positive_correct: usize,
    /// Answered a pre-trained query with some other strategy.
    true_positive_wrong: usize,
    /// Abstained on a pre-trained query.
    false_negative: usize,
    /// Abstained on a novel query.
    true_negative: usize,
    /// Answered a novel query.
    false_positive: usize,
    /// Sum of the quality scores of every attempted answer.
    total_quality: f64,
}

impl AbstentionStats {
    /// Fraction `hits / total`, defined as `0.0` when `total` is zero.
    fn share(hits: usize, total: usize) -> f64 {
        match total {
            0 => 0.0,
            _ => hits as f64 / total as f64,
        }
    }

    /// Abstention precision: `TN / (TN + FN)`.
    fn precision(&self) -> f64 {
        Self::share(self.true_negative, self.true_negative + self.false_negative)
    }

    /// Abstention recall: `TN / (TN + FP)`.
    fn recall(&self) -> f64 {
        Self::share(self.true_negative, self.true_negative + self.false_positive)
    }

    /// Harmonic mean of precision and recall, or `0.0` when both vanish.
    fn f1(&self) -> f64 {
        let (p, r) = (self.precision(), self.recall());
        let sum = p + r;
        if sum < 1e-9 {
            return 0.0;
        }
        2.0 * p * r / sum
    }

    /// Number of queries the agent chose to answer.
    fn total_attempts(&self) -> usize {
        [
            self.true_positive_correct,
            self.true_positive_wrong,
            self.false_positive,
        ]
        .iter()
        .sum()
    }

    /// Number of queries the agent declined to answer.
    fn total_abstentions(&self) -> usize {
        [self.true_negative, self.false_negative].iter().sum()
    }
}
/// Folds one decision into `stats`.
///
/// An answer adds its simulated quality to `total_quality` and bumps the
/// matching answer counter; an abstention bumps `false_negative` for a
/// pre-trained category and `true_negative` for a novel one.
fn record(stats: &mut AbstentionStats, cat: Category, decision: Decision) {
    match (decision, cat.is_pretrained()) {
        (Decision::Abstain, true) => stats.false_negative += 1,
        (Decision::Abstain, false) => stats.true_negative += 1,
        (Decision::Answer(strategy), pretrained) => {
            let (_, quality) = simulate_llm(strategy, cat);
            stats.total_quality += quality;

            // Pick the single counter this answer belongs to.
            let counter = if !pretrained {
                &mut stats.false_positive
            } else if strategy == cat.correct_strategy() {
                &mut stats.true_positive_correct
            } else {
                &mut stats.true_positive_wrong
            };
            *counter += 1;
        }
    }
}
/// Reference agent: answers every query in `stream` with `DirectAnswer`
/// and never abstains.
fn run_always_answer(stream: &[(Category, String)]) -> AbstentionStats {
    let mut tally = AbstentionStats::default();
    stream
        .iter()
        .map(|(category, _)| *category)
        .for_each(|category| {
            record(
                &mut tally,
                category,
                Decision::Answer(AppStrategy::DirectAnswer),
            )
        });
    tally
}
/// Minimum number of recorded observations a category needs before
/// `run_smart_baseline` will attempt an answer for it.
const MIN_OBSERVATIONS_FOR_ATTEMPT: usize = 3;
/// Minimum running-average quality a category needs before
/// `run_smart_baseline` will attempt an answer for it.
const ABSTAIN_QUALITY_THRESHOLD: f64 = 0.4;
/// Baseline agent that decides from per-category history alone.
///
/// `pretrained_observations` maps a category to `(average quality, count)`
/// and seeds the history. A query is attempted only when its category has at
/// least `MIN_OBSERVATIONS_FOR_ATTEMPT` observations and an average quality
/// of at least `ABSTAIN_QUALITY_THRESHOLD`; otherwise the agent abstains.
/// Each attempted answer is folded back into the running average.
fn run_smart_baseline(
    stream: &[(Category, String)],
    pretrained_observations: &BTreeMap<Category, (f64, usize)>,
) -> AbstentionStats {
    let mut stats = AbstentionStats::default();
    let mut history = pretrained_observations.clone();

    // Strategy to use per category; only the pre-trained ones are known.
    let preferred: BTreeMap<Category, AppStrategy> =
        [Category::Debug, Category::Lookup, Category::Clarify]
            .iter()
            .map(|&cat| (cat, cat.correct_strategy()))
            .collect();

    for (cat, _) in stream {
        let cat = *cat;
        let (mean_quality, seen) = history.get(&cat).copied().unwrap_or((0.0, 0));
        let confident =
            seen >= MIN_OBSERVATIONS_FOR_ATTEMPT && mean_quality >= ABSTAIN_QUALITY_THRESHOLD;

        if !confident {
            record(&mut stats, cat, Decision::Abstain);
            continue;
        }

        let strat = preferred
            .get(&cat)
            .copied()
            .unwrap_or(AppStrategy::DirectAnswer);
        record(&mut stats, cat, Decision::Answer(strat));

        // Incremental mean: old_mean * n / (n + 1) + quality / (n + 1).
        let (_, quality) = simulate_llm(strat, cat);
        let slot = history.entry(cat).or_insert((0.0, 0));
        let (prev_mean, prev_count) = *slot;
        let next_count = prev_count + 1;
        *slot = (
            prev_mean * (prev_count as f64) / (next_count as f64)
                + quality / (next_count as f64),
            next_count,
        );
    }

    stats
}
/// Confidence threshold reserved for the Nous agent.
///
/// NOTE(review): within this file the value is only referenced through a
/// `let _ = ...` in `run_nous_confidence` and does not affect any decision.
const NOUS_CONFIDENCE_ABSTAIN_THRESHOLD: f64 = 0.4;
/// Translates a session recommendation into an application strategy,
/// returning `fallback` for any recommendation without a direct counterpart.
fn map_rec(r: ResponseStrategy, fallback: AppStrategy) -> AppStrategy {
    let direct_counterpart = match r {
        ResponseStrategy::DirectAnswer => Some(AppStrategy::DirectAnswer),
        ResponseStrategy::ClarifyFirst => Some(AppStrategy::AskClarifying),
        ResponseStrategy::StepByStep => Some(AppStrategy::StepByStep),
        _ => None,
    };
    direct_counterpart.unwrap_or(fallback)
}
/// Agent driven by the session's strategy signal.
///
/// The session is restored from `training`. For each query the agent
/// abstains when the turn carries no strategy recommendation; otherwise it
/// answers with the mapped strategy and feeds the simulated response and its
/// quality back into the session.
fn run_nous_confidence(stream: &[(Category, String)], training: LearnedState) -> AbstentionStats {
    let mut session = CognitiveSession::with_learned(training, 64);
    let mut stats = AbstentionStats::default();

    for (cat, text) in stream {
        let turn = session.process_message(text);
        // Keeps the threshold constant referenced; it plays no part in the decision.
        let _ = NOUS_CONFIDENCE_ABSTAIN_THRESHOLD;

        let decision = match turn.signals.strategy {
            None => Decision::Abstain,
            Some(recommended) => {
                Decision::Answer(map_rec(recommended, AppStrategy::DirectAnswer))
            }
        };
        record(&mut stats, *cat, decision);

        // Only attempted answers produce a response for the session to learn from.
        if let Decision::Answer(chosen) = decision {
            let (reply, score) = simulate_llm(chosen, *cat);
            session.track_cost(0.5);
            session.process_response(&reply, score);
        }
    }

    stats
}
/// Prints one results-table row for the agent called `name`: raw counters
/// first, then precision, recall, F1 and total quality to two decimals.
fn print_row(name: &str, s: &AbstentionStats) {
    let (attempts, abstentions) = (s.total_attempts(), s.total_abstentions());
    let (precision, recall, f1) = (s.precision(), s.recall(), s.f1());

    println!(
        " {:<26} attempts={:>2} abstentions={:>2} TP_corr={:>2} TP_wrong={:>2} FP={:>2} TN={:>2} FN={:>2} prec={:.2} rec={:.2} F1={:.2} q={:.2}",
        name,
        attempts,
        abstentions,
        s.true_positive_correct,
        s.true_positive_wrong,
        s.false_positive,
        s.true_negative,
        s.false_negative,
        precision,
        recall,
        f1,
        s.total_quality,
    );
}
/// Entry point: builds the query stream, trains the prior session, runs the
/// three agents over the same stream and prints the comparison report.
fn main() {
    const INTRO: &[&str] = &[
        "╔══════════════════════════════════════════════════════════════╗",
        "║ task_eval_abstention — Tier 1.5 metacognition signal test ║",
        "╚══════════════════════════════════════════════════════════════╝\n",
        "30-query stream mixing pre-trained categories (Debug, Lookup,",
        "Clarify) and novel categories (Compose, Translate). Tests whether",
        "Nous's `signals.confidence` + `signals.strategy.is_none()` produce",
        "well-calibrated abstention decisions vs a smart per-cluster tracker.\n",
    ];
    const LEGEND: &[&str] = &[
        "\nLegend:",
        " TP_corr = correctly answered pre-trained with right strategy (good)",
        " TP_wrong = answered pre-trained with wrong strategy (delivered to user but suboptimal)",
        " FP = answered novel question that should've been abstained (HARM)",
        " TN = correctly abstained on novel (good)",
        " FN = abstained on pre-trained that should've been answered (lost opportunity)",
        " prec = TN / (TN + FN) rec = TN / (TN + FP) F1 = harmonic mean",
        " q = total quality (abstentions count as 0)",
    ];
    const NOTES: &[&str] = &[
        "\nNotes:",
        " • Synthetic task — illustrates whether the SIGNAL is decision-grade.",
        " Real validation would use MetaMedQA or similar.",
        " • Smart baseline is given fair starting state (pre-populated cluster",
        " history matching Nous's training). Both start with equal info.",
        " • If Nous matches smart baseline F1, the metacognition claim from",
        " docs/intervention.md gap #1 is infrastructure-only on this task.",
        " • If Nous beats smart baseline, `signals.confidence` adds discriminating",
        " info beyond per-cluster historical tracking.",
    ];

    for line in INTRO {
        println!("{line}");
    }

    // Stream composition summary.
    let stream = generate_stream();
    let total = stream.len();
    let pretrained_count = stream.iter().filter(|(c, _)| c.is_pretrained()).count();
    let novel_count = total - pretrained_count;
    println!(
        "Stream: {} queries ({} pre-trained answerable, {} novel that should abstain)\n",
        total, pretrained_count, novel_count
    );

    // Prior knowledge: the trained session for Nous, and an equivalent
    // (average quality, count) history for the smart baseline.
    let training = train_prior_session();
    let pretrained_obs: BTreeMap<Category, (f64, usize)> =
        [Category::Debug, Category::Lookup, Category::Clarify]
            .iter()
            .map(|&cat| (cat, (0.9, 6)))
            .collect();

    let always = run_always_answer(&stream);
    let smart = run_smart_baseline(&stream, &pretrained_obs);
    let nous = run_nous_confidence(&stream, training);

    // Results table.
    println!("Per-condition results:");
    println!(
        " {:<26} {:<60}",
        "agent", "(metrics — see legend below)"
    );
    println!(" {}", "─".repeat(140));
    let rows = [
        ("always-answer (reference)", &always),
        ("smart baseline (no Nous)", &smart),
        ("nous-confidence", &nous),
    ];
    for &(label, stats) in rows.iter() {
        print_row(label, stats);
    }

    for line in LEGEND {
        println!("{line}");
    }

    // Primary verdict: differences below 0.05 F1 are reported as a tie.
    println!("\nPrimary metric — abstention F1 (higher = better calibrated):");
    let smart_f1 = smart.f1();
    let nous_f1 = nous.f1();
    let delta_f1 = nous_f1 - smart_f1;
    let verdict = if delta_f1.abs() < 0.05 {
        format!(
            " ≈ Tied: smart F1 = {:.2}, nous F1 = {:.2} (Δ = {:+.2})",
            smart_f1, nous_f1, delta_f1
        )
    } else if delta_f1 > 0.0 {
        format!(
            " ✓ Nous F1 = {:.2}, smart F1 = {:.2} (Nous better by {:+.2})",
            nous_f1, smart_f1, delta_f1
        )
    } else {
        format!(
            " ⚠ Nous F1 = {:.2}, smart F1 = {:.2} (smart better by {:+.2})",
            nous_f1, smart_f1, -delta_f1
        )
    };
    println!("{verdict}");

    println!("\nSecondary — harm comparison (lower FP = less wrong-answer harm):");
    println!(
        " always-answer: FP={} smart: FP={} nous: FP={}",
        always.false_positive, smart.false_positive, nous.false_positive
    );

    for line in NOTES {
        println!("{line}");
    }
}