noos 0.2.1

Reliability layer for Rust LLM agents: scope drift, cost circuit breaks, and procedural correction memory as event-driven Decisions.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
//! Inference Engine — the unified brain.
//!
//! Brain analog: sensorimotor integration loop. Sensory input (tokenizer)
//! → cortical processing (model forward pass) → motor output (token sampling),
//! with neuromodulatory systems (CognitiveState) coloring every stage.
//! Key papers: Wolpert 1997 (forward models in motor control),
//! Doya 1999 (modulatory systems in sensorimotor loops).
//!
//! This is where Noos and the model become ONE system — same process,
//! same memory space. See `docs/intervention.md` Tier 1.
//!
//! Performance: per-token = model.forward() + sampler.sample() (~10-500ms).

use crate::cognition::delta_modulation::compute_delta_modulation;
use crate::errors::{NoosError, NoosResult};
use crate::inference::cognitive_model::CognitiveModel;
use crate::inference::model::LocalModel;
use crate::inference::sampler::CognitiveSampler;
use crate::inference::tokenizer::NoosTokenizer;
use crate::types::intervention::{CognitiveState, DeltaModulation};

/// Result of generating a single token.
#[derive(Debug, Clone)]
pub struct GenerationStep {
    /// The sampled token ID.
    pub token_id: u32,
    /// The decoded text fragment for this token.
    pub text: String,
    /// Whether this was the end-of-sequence token.
    pub is_eos: bool,
    /// Position in the sequence (0-indexed).
    pub position: usize,
}

/// The unified brain — model + tokenizer + cognitive sampler.
///
/// Generic over model and tokenizer to support:
/// - Real inference (CandleModel + HfTokenizer) in production
/// - Mock inference (MockModel + MockTokenizer) in tests
/// - Any future backend (ONNX, TensorRT, etc.)
pub struct InferenceEngine<M: LocalModel, T: NoosTokenizer> {
    /// The model — cortical tissue.
    model: M,
    /// The tokenizer — sensory encoder/decoder.
    tokenizer: T,
    /// Current position in the sequence (for KV cache).
    position: usize,
    /// All tokens generated so far (for repetition penalty).
    generated_tokens: Vec<u32>,
    /// Input prompt tokens (for position tracking).
    prompt_tokens: Vec<u32>,
}

impl<M: LocalModel, T: NoosTokenizer> InferenceEngine<M, T> {
    /// Create a new inference engine.
    pub fn new(model: M, tokenizer: T) -> Self {
        Self {
            model,
            tokenizer,
            position: 0,
            generated_tokens: Vec::new(),
            prompt_tokens: Vec::new(),
        }
    }

    /// Mutable: populates KV cache, initializes position and prompt tokens.
    /// Requires mutation because the model's KV cache must persist across
    /// subsequent generate_next() calls for attention context.
    pub fn set_prompt(&mut self, text: &str) -> NoosResult<()> {
        let tokens = self.tokenizer.encode(text, true)?;
        if tokens.is_empty() {
            return Err(NoosError::Internal("Empty prompt after tokenization".into()));
        }

        // Forward pass on full prompt to fill KV cache.
        self.model.forward(&tokens, 0)?;

        self.prompt_tokens = tokens;
        self.position = self.prompt_tokens.len();
        self.generated_tokens.clear();

        Ok(())
    }

    /// Mutable: advances position, appends to generated_tokens, updates KV cache.
    /// Requires mutation because auto-regressive generation is inherently
    /// stateful — each token depends on all previous tokens via KV cache.
    ///
    /// This is the core loop of the unified brain:
    /// 1. Model forward pass → raw logits (cortical output)
    /// 2. CognitiveState → SamplingOverride (neuromodulatory context)
    /// 3. CognitiveSampler applies modulation → sampled token
    ///
    /// The CognitiveState can change between calls — enabling per-token
    /// modulation as the conversation evolves.
    pub fn generate_next(
        &mut self,
        cognitive_state: &CognitiveState,
    ) -> NoosResult<GenerationStep> {
        // Determine input tokens for this step.
        let input_tokens = if self.generated_tokens.is_empty() {
            // First generation step: use last prompt token.
            // (KV cache already has full prompt from set_prompt.)
            vec![*self.prompt_tokens.last().ok_or_else(|| {
                NoosError::Internal("No prompt set".into())
            })?]
        } else {
            // Subsequent steps: feed the last generated token.
            vec![*self.generated_tokens.last().unwrap_or(&0)]
        };

        // Step 1: Forward pass → raw logits.
        let logits = self.model.forward(&input_tokens, self.position)?;

        // Step 2: Create cognitive sampler from current state.
        let sampler = CognitiveSampler::from_cognitive_state(cognitive_state);

        // Step 3: All previous tokens (prompt + generated) for repetition penalty.
        let all_tokens: Vec<u32> = self
            .prompt_tokens
            .iter()
            .chain(self.generated_tokens.iter())
            .copied()
            .collect();

        // Step 4: Sample with cognitive modulation.
        let token_id = sampler.sample(&logits, &all_tokens)?;

        // Decode and track.
        let text = self.tokenizer.decode_token(token_id)?;
        let is_eos = token_id == self.tokenizer.eos_token_id();
        let step_position = self.position;

        self.generated_tokens.push(token_id);
        self.position += 1;

        Ok(GenerationStep {
            token_id,
            text,
            is_eos,
            position: step_position,
        })
    }

    /// Mutable: calls generate_next() repeatedly, accumulating tokens.
    /// Requires mutation for same reason as generate_next().
    ///
    /// Stops when: max_tokens reached OR EOS token generated.
    /// The cognitive_state is applied uniformly to all tokens.
    /// For per-token modulation, use generate_next() in a loop.
    pub fn generate(
        &mut self,
        cognitive_state: &CognitiveState,
        max_tokens: usize,
    ) -> NoosResult<String> {
        let mut output = String::new();

        for _ in 0..max_tokens {
            let step = self.generate_next(cognitive_state)?;
            if step.is_eos {
                break;
            }
            output.push_str(&step.text);
        }

        Ok(output)
    }

    /// Mutable: clears KV cache, position, and generated tokens.
    /// Requires mutation because all generation state must be wiped
    /// between conversations to prevent cross-contamination.
    pub fn reset(&mut self) {
        self.model.reset_cache();
        self.position = 0;
        self.generated_tokens.clear();
        self.prompt_tokens.clear();
    }

    /// Access generated tokens so far (for external analysis).
    pub fn generated_tokens(&self) -> &[u32] {
        &self.generated_tokens
    }

    /// Current sequence position.
    pub fn position(&self) -> usize {
        self.position
    }
}

/// Result of generating a single token with cognitive modulation (Tầng 1 + 2).
///
/// Extends GenerationStep with delta modulation metadata (efference copy).
/// Brain analog: motor output + corollary discharge (Crapse & Sommer 2008).
#[derive(Debug, Clone)]
pub struct CognitiveGenerationStep {
    /// The sampled token ID.
    pub token_id: u32,
    /// The decoded text fragment for this token.
    pub text: String,
    /// Whether this was the end-of-sequence token.
    pub is_eos: bool,
    /// Position in the sequence (0-indexed).
    pub position: usize,
    /// Whether delta modulation was applied at the model level.
    pub modulation_applied: bool,
    /// Which layers were modulated (empty if no modulation).
    pub modulated_layers: Vec<usize>,
    /// The delta modulation that was computed for this token.
    pub delta_modulation: DeltaModulation,
    /// Tầng 3: gate blend factor (None if no gate). 0.0 = passthrough, higher = active modulation.
    /// Bottom-up signal: model telling subcortex how salient the current input is.
    pub gate_alpha: Option<f64>,
    /// Tầng 3: gate's learned delta gain (None if no gate). [0.5, 2.0].
    /// Bottom-up signal: model's state update speed preference.
    pub gate_delta_gain: Option<f64>,
}

/// Cognitive inference methods — available when the model supports cognitive forward.
///
/// These methods stack Tầng 2 (delta modulation → model internals) on top of
/// Tầng 1 (sampling override → token selection). Both tầng apply together:
/// delta modulation changes HOW the model thinks, then sampling modulation
/// changes WHAT token is selected from the resulting distribution.
///
/// Brain analog: neuromodulators (Tầng 2, NE/DA) change cortical processing,
/// THEN output gating (Tầng 1, basal ganglia) selects the action.
impl<M: CognitiveModel, T: NoosTokenizer> InferenceEngine<M, T> {
    /// Mutable: generates one token with full cognitive modulation (Tầng 1 + 2).
    ///
    /// Pipeline:
    /// 1. Compute DeltaModulation from CognitiveState (cognition layer)
    /// 2. Forward pass with delta modulation (model internalizes cognitive state)
    /// 3. CognitiveSampler modulates sampling parameters (Tầng 1, still applies)
    /// 4. Sample token from modulated distribution
    ///
    /// Tầng 2 + Tầng 1 stack — they don't replace each other.
    /// This is the unified brain: cognitive state shapes BOTH processing AND output.
    pub fn generate_next_cognitive(
        &mut self,
        cognitive_state: &CognitiveState,
    ) -> NoosResult<CognitiveGenerationStep> {
        // Determine input tokens (same logic as generate_next).
        let input_tokens = if self.generated_tokens.is_empty() {
            vec![*self.prompt_tokens.last().ok_or_else(|| {
                NoosError::Internal("No prompt set".into())
            })?]
        } else {
            vec![*self.generated_tokens.last().unwrap_or(&0)]
        };

        // Tầng 2: Compute delta modulation from cognitive state.
        let delta_mod = compute_delta_modulation(cognitive_state, self.model.num_layers());

        // Tầng 2: Forward pass with cognitive modulation.
        // The model applies gain_factor to delta at targeted layers.
        let forward_result =
            self.model
                .forward_cognitive(&input_tokens, self.position, &delta_mod)?;

        // Tầng 1: Create cognitive sampler from current state.
        // Sampling modulation STILL applies on top of delta modulation.
        let sampler = CognitiveSampler::from_cognitive_state(cognitive_state);

        // All previous tokens for repetition penalty.
        let all_tokens: Vec<u32> = self
            .prompt_tokens
            .iter()
            .chain(self.generated_tokens.iter())
            .copied()
            .collect();

        // Tầng 1: Sample with cognitive modulation.
        let token_id = sampler.sample(&forward_result.logits, &all_tokens)?;

        // Decode and track.
        let text = self.tokenizer.decode_token(token_id)?;
        let is_eos = token_id == self.tokenizer.eos_token_id();
        let step_position = self.position;

        self.generated_tokens.push(token_id);
        self.position += 1;

        Ok(CognitiveGenerationStep {
            token_id,
            text,
            is_eos,
            position: step_position,
            modulation_applied: forward_result.modulation_applied,
            modulated_layers: forward_result.modulated_layers,
            delta_modulation: delta_mod,
            gate_alpha: forward_result.gate_alpha,
            gate_delta_gain: forward_result.gate_delta_gain,
        })
    }

    /// Mutable: generates multiple tokens with cognitive modulation.
    ///
    /// Like generate(), but uses forward_cognitive() for Tầng 2.
    /// The cognitive_state is applied uniformly to all tokens.
    /// For per-token modulation, use generate_next_cognitive() in a loop.
    pub fn generate_cognitive(
        &mut self,
        cognitive_state: &CognitiveState,
        max_tokens: usize,
    ) -> NoosResult<String> {
        let mut output = String::new();

        for _ in 0..max_tokens {
            let step = self.generate_next_cognitive(cognitive_state)?;
            if step.is_eos {
                break;
            }
            output.push_str(&step.text);
        }

        Ok(output)
    }

    /// Number of layers in the underlying model.
    pub fn model_num_layers(&self) -> usize {
        self.model.num_layers()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::inference::model::tests::MockModel;
    use crate::inference::tokenizer::tests::MockTokenizer;
    use crate::types::world::GainMode;

    fn make_engine() -> InferenceEngine<MockModel, MockTokenizer> {
        InferenceEngine::new(MockModel::new(100), MockTokenizer::new(100))
    }

    #[test]
    fn generate_next_produces_token() {
        let mut engine = make_engine();
        engine.set_prompt("hello").unwrap();

        let step = engine
            .generate_next(&CognitiveState::default())
            .unwrap();

        assert!(!step.text.is_empty());
        assert_eq!(engine.generated_tokens().len(), 1);
    }

    #[test]
    fn generate_respects_max_tokens() {
        let mut engine = make_engine();
        engine.set_prompt("hello").unwrap();

        let _result = engine.generate(&CognitiveState::default(), 5).unwrap();

        // Should generate at most 5 tokens.
        assert!(engine.generated_tokens().len() <= 5);
    }

    #[test]
    fn position_advances() {
        let mut engine = make_engine();
        engine.set_prompt("hi").unwrap();
        let initial_pos = engine.position();

        engine.generate_next(&CognitiveState::default()).unwrap();
        assert_eq!(engine.position(), initial_pos + 1);

        engine.generate_next(&CognitiveState::default()).unwrap();
        assert_eq!(engine.position(), initial_pos + 2);
    }

    #[test]
    fn reset_clears_state() {
        let mut engine = make_engine();
        engine.set_prompt("hello").unwrap();
        engine.generate_next(&CognitiveState::default()).unwrap();

        engine.reset();

        assert_eq!(engine.position(), 0);
        assert!(engine.generated_tokens().is_empty());
    }

    #[test]
    fn empty_prompt_returns_error() {
        let mut engine = make_engine();
        let result = engine.set_prompt("");
        assert!(result.is_err());
    }

    #[test]
    fn generate_without_prompt_returns_error() {
        let mut engine = make_engine();
        let result = engine.generate_next(&CognitiveState::default());
        assert!(result.is_err());
    }

    #[test]
    fn cognitive_state_affects_sampling() {
        let mut engine1 = make_engine();
        let mut engine2 = make_engine();
        engine1.set_prompt("test").unwrap();
        engine2.set_prompt("test").unwrap();

        let phasic = CognitiveState {
            gain_mode: GainMode::Phasic,
            ..CognitiveState::default()
        };
        let tonic = CognitiveState {
            gain_mode: GainMode::Tonic,
            ..CognitiveState::default()
        };

        // Both should succeed — cognitive state affects HOW tokens are sampled,
        // not WHETHER generation works.
        let step1 = engine1.generate_next(&phasic).unwrap();
        let step2 = engine2.generate_next(&tonic).unwrap();

        // With mock model, both produce valid tokens.
        assert!(step1.token_id < 100);
        assert!(step2.token_id < 100);
    }

    // ─── Tầng 2 cognitive engine tests ───

    use crate::inference::cognitive_model::tests::MockCognitiveModel;

    fn make_cognitive_engine() -> InferenceEngine<MockCognitiveModel, MockTokenizer> {
        InferenceEngine::new(MockCognitiveModel::new(100, 64), MockTokenizer::new(100))
    }

    #[test]
    fn generate_next_cognitive_produces_token() {
        let mut engine = make_cognitive_engine();
        engine.set_prompt("hello").unwrap();

        let step = engine
            .generate_next_cognitive(&CognitiveState::default())
            .unwrap();

        assert!(!step.text.is_empty());
        assert_eq!(engine.generated_tokens().len(), 1);
    }

    #[test]
    fn cognitive_generation_applies_delta_modulation() {
        let mut engine = make_cognitive_engine();
        engine.set_prompt("test").unwrap();

        let phasic = CognitiveState {
            gain_mode: GainMode::Phasic,
            ..CognitiveState::default()
        };
        let step = engine.generate_next_cognitive(&phasic).unwrap();

        assert!(
            step.modulation_applied,
            "Phasic mode should trigger delta modulation"
        );
        assert!(
            step.delta_modulation.gain_factor < 1.0,
            "Phasic should reduce delta (compensatory retention)"
        );
        assert!(
            !step.modulated_layers.is_empty(),
            "Should have modulated mid-layers"
        );
    }

    #[test]
    fn neutral_cognitive_generation_no_modulation() {
        let mut engine = make_cognitive_engine();
        engine.set_prompt("test").unwrap();

        let neutral = CognitiveState::default();
        let step = engine.generate_next_cognitive(&neutral).unwrap();

        assert!(
            !step.modulation_applied,
            "Neutral state (gain=1.0) should not modulate"
        );
    }

    #[test]
    fn cognitive_and_sampling_stack() {
        // Verify that both tầng work: model gets delta modulation,
        // sampler gets sampling override. Both from same CognitiveState.
        let mut engine = make_cognitive_engine();
        engine.set_prompt("test").unwrap();

        let tonic = CognitiveState {
            gain_mode: GainMode::Tonic,
            ..CognitiveState::default()
        };
        let step = engine.generate_next_cognitive(&tonic).unwrap();

        assert!(
            step.delta_modulation.gain_factor < 1.0,
            "Tonic should reduce delta (Tầng 2)"
        );
        // Token was still successfully sampled (Tầng 1 sampling worked).
        assert!(step.token_id < 100);
    }

    #[test]
    fn generate_cognitive_multiple_tokens() {
        let mut engine = make_cognitive_engine();
        engine.set_prompt("hello").unwrap();

        let result = engine
            .generate_cognitive(&CognitiveState::default(), 3)
            .unwrap();

        assert!(!result.is_empty());
        assert!(engine.generated_tokens().len() <= 3);
    }

    #[test]
    fn model_num_layers_accessible() {
        let engine = make_cognitive_engine();
        assert_eq!(engine.model_num_layers(), 64);
    }
}