// epsilon_engine — lib.rs (crate root)
1//! Pillar: III. PACR field: Γ (Cognitive Complexity).
2//!
3//! Full CSSR ε-machine inference producing [`CognitiveSplit`] (`S_T`, `H_T`) with
4//! bootstrap confidence intervals.
5//!
6//! # Quick start
7//!
8//! ```
9//! use epsilon_engine::{Config, infer};
10//!
11//! let symbols: Vec<u8> = vec![0, 1, 0, 1, 0, 1]; // tiny example
12//! let cfg = Config::default();
13//! let result = infer(&symbols, cfg);
14//! assert!(result.cognitive_split.statistical_complexity.point >= 0.0);
15//! ```
16//!
17//! # CSSR algorithm summary
18//!
19//! See `cssr.rs` for the full algorithm description.
20//! Briefly: suffix-tree statistics → KS homogeneity tests → state splitting →
21//! merge pass → compact → stationary π → (`C_μ`, `h_μ`) with B=200 bootstrap CIs.
22//!
23//! # Memory budget
24//!
25//! Suffix table entries: at most min(N, |A|^L) per depth level.
26//! For N=100 000, L=12, |A|=8: unique depth-L histories ≤ 100 000.
27//! Total entries across all depths ≤ N × L = 1 200 000.
28//! Each entry: Vec<u32> of |A|=8 elements ≈ 32 B → ~38 MiB worst-case,
29//! well under the 200 MiB budget.
30
31#![forbid(unsafe_code)]
32#![deny(clippy::all, clippy::pedantic)]
33#![allow(
34    clippy::cast_precision_loss,
35    clippy::cast_possible_truncation,
36    clippy::cast_sign_loss,
37    clippy::similar_names,
38    clippy::doc_markdown,
39    clippy::must_use_candidate,
40    clippy::needless_pass_by_value,
41    clippy::unreadable_literal,
42    clippy::missing_panics_doc,
43    clippy::missing_errors_doc,
44    clippy::doc_overindented_list_items,
45    clippy::ptr_arg
46)]
47
48pub mod bootstrap_backend;
49pub mod complexity;
50pub mod cssr;
51pub mod quick_screen;
52pub mod symbolize;
53
54// Re-export WordSymbolizer for convenience
55pub use symbolize::WordSymbolizer;
56
57use complexity::{bootstrap_ci, compute_metrics, stationary_distribution};
58use cssr::run_cssr;
59use pacr_types::CognitiveSplit;
60
61// ── Public API types ──────────────────────────────────────────────────────────
62
/// Configuration for the ε-machine inference.
#[derive(Debug, Clone)]
pub struct Config {
    /// Maximum history length L (deeper = more expressive, slower, more memory).
    pub max_depth: usize,
    /// KS significance level α.  Lower → fewer false splits.
    pub alpha: f64,
    /// Bootstrap replicates B for CI estimation.
    pub bootstrap_b: usize,
    /// Size of the discrete alphabet |A|.  Symbols must be `0..alphabet_size`.
    pub alphabet_size: usize,
}

impl Default for Config {
    /// Conservative defaults: binary alphabet, depth L = 4, α = 0.001, B = 200.
    fn default() -> Self {
        Config {
            alphabet_size: 2,
            bootstrap_b: 200,
            alpha: 0.001,
            max_depth: 4,
        }
    }
}
86
/// Full result of ε-machine inference.
///
/// Produced by [`infer`] (with bootstrap confidence intervals) and by
/// [`infer_fast`] (point estimates only, built via `Estimate::exact`).
#[derive(Debug, Clone)]
pub struct InferResult {
    /// PACR Γ field: (`S_T` = `C_μ`, `H_T` = `h_μ`) with 95 % bootstrap CIs.
    pub cognitive_split: CognitiveSplit,
    /// Number of inferred causal states.
    pub num_states: usize,
}
95
96// ── Public API ────────────────────────────────────────────────────────────────
97
98/// Infer the ε-machine from a discrete symbol sequence.
99///
100/// # Arguments
101///
102/// * `symbols` — sequence of discrete symbols in `0..cfg.alphabet_size`.
103///   Values ≥ `alphabet_size` are silently ignored during suffix counting.
104/// * `cfg`     — algorithm configuration.
105///
106/// # Panics
107///
108/// Does not panic; degrades gracefully on empty input (returns zero metrics).
109#[must_use]
110pub fn infer(symbols: &[u8], cfg: Config) -> InferResult {
111    if symbols.is_empty() {
112        return InferResult {
113            cognitive_split: zero_split(),
114            num_states: 1,
115        };
116    }
117    let result = run_cssr(symbols, cfg.alphabet_size, cfg.max_depth, cfg.alpha);
118    let num_states = result.states.len();
119    let (c_mu, h_mu) = bootstrap_ci(&result, symbols, cfg.bootstrap_b);
120    InferResult {
121        cognitive_split: CognitiveSplit {
122            statistical_complexity: c_mu,
123            entropy_rate: h_mu,
124        },
125        num_states,
126    }
127}
128
129/// Infer without bootstrap CI (faster; `Estimate::exact` point estimates only).
130#[must_use]
131pub fn infer_fast(symbols: &[u8], cfg: Config) -> InferResult {
132    if symbols.is_empty() {
133        return InferResult {
134            cognitive_split: zero_split(),
135            num_states: 1,
136        };
137    }
138    let result = run_cssr(symbols, cfg.alphabet_size, cfg.max_depth, cfg.alpha);
139    let pi = stationary_distribution(&result, symbols);
140    let (c_point, h_point) = compute_metrics(&result.states, &pi);
141    let num_states = result.states.len();
142    InferResult {
143        cognitive_split: CognitiveSplit {
144            statistical_complexity: pacr_types::Estimate::exact(c_point),
145            entropy_rate: pacr_types::Estimate::exact(h_point),
146        },
147        num_states,
148    }
149}
150
151fn zero_split() -> CognitiveSplit {
152    CognitiveSplit {
153        statistical_complexity: pacr_types::Estimate::exact(0.0),
154        entropy_rate: pacr_types::Estimate::exact(0.0),
155    }
156}
157
/// Compute empirical entropy rate of a symbol sequence.
///
/// Uses Shannon entropy H(X) = -Σ p(x) log₂ p(x) of the single-symbol
/// distribution as a rough estimate of the entropy rate. This is NOT the
/// true h_μ (which requires causal states), but serves as a fast
/// approximation for drift detection.
///
/// # Arguments
///
/// * `symbols` — discrete symbol sequence
/// * `alphabet_size` — size of the alphabet (symbols must be < alphabet_size;
///   out-of-range symbols are silently ignored)
///
/// # Returns
///
/// Empirical entropy in bits. Returns 0.0 if the sequence is empty or if no
/// symbol falls inside the alphabet.
#[must_use]
pub fn empirical_entropy_rate(symbols: &[u8], alphabet_size: usize) -> f64 {
    if symbols.is_empty() {
        return 0.0;
    }

    let mut counts = vec![0_u64; alphabet_size];
    for &sym in symbols {
        // `get_mut` returns None for symbols ≥ alphabet_size, skipping them.
        if let Some(slot) = counts.get_mut(sym as usize) {
            *slot += 1;
        }
    }

    // Bug fix: normalise by the number of symbols actually *counted*, not by
    // `symbols.len()`.  The old divisor left the probabilities summing to
    // less than 1 whenever out-of-range symbols were skipped, deflating the
    // entropy estimate.  (For fully in-range input the result is unchanged.)
    let total: u64 = counts.iter().sum();
    if total == 0 {
        // Alphabet size 0, or every symbol out of range: entropy undefined → 0.
        return 0.0;
    }
    let n = total as f64;
    counts
        .iter()
        .filter(|&&count| count > 0)
        .map(|&count| {
            let p = count as f64 / n;
            -p * p.log2()
        })
        .sum()
}
195
196// ── Test utilities (compile only in test builds) ──────────────────────────────
197
198#[cfg(test)]
199mod test_utils {
200    /// Xorshift64 RNG — deterministic, zero-allocation, no external deps.
201    pub struct TestRng(u64);
202
203    impl TestRng {
204        pub fn new(seed: u64) -> Self {
205            Self(if seed == 0 {
206                0xdead_beef_cafe_babe
207            } else {
208                seed
209            })
210        }
211
212        pub fn next_u64(&mut self) -> u64 {
213            self.0 ^= self.0 << 13;
214            self.0 ^= self.0 >> 7;
215            self.0 ^= self.0 << 17;
216            self.0
217        }
218
219        pub fn next_f64(&mut self) -> f64 {
220            (self.next_u64() >> 11) as f64 / (1u64 << 53) as f64
221        }
222    }
223
224    /// Generate an Even Process sequence of length `n`.
225    ///
226    /// Two-state Markov chain with symmetric stationary distribution π=[0.5, 0.5].
227    ///
228    /// ```text
229    /// S0 --[0, p=2/3]--> S0      S0 --[1, p=1/3]--> S1
230    /// S1 --[0, p=1/3]--> S0      S1 --[1, p=2/3]--> S1
231    /// ```
232    ///
233    /// Theoretical: C_μ = 1.0 bit (exact), h_μ ≈ 0.9183 bits/sym, 2 states.
234    pub fn gen_even_process(n: usize, seed: u64) -> Vec<u8> {
235        let mut rng = TestRng::new(seed);
236        let mut symbols = Vec::with_capacity(n);
237        let mut state = 0u8;
238        for _ in 0..n {
239            let u = rng.next_f64();
240            let (sym, next) = if state == 0 {
241                if u < 2.0 / 3.0 {
242                    (0u8, 0u8)
243                } else {
244                    (1u8, 1u8)
245                }
246            } else {
247                if u < 1.0 / 3.0 {
248                    (0u8, 0u8)
249                } else {
250                    (1u8, 1u8)
251                }
252            };
253            symbols.push(sym);
254            state = next;
255        }
256        symbols
257    }
258
259    /// Generate a Golden Mean Process sequence of length `n`.
260    ///
261    /// Forbids consecutive 1s (the "Golden Mean shift").
262    ///
263    /// ```text
264    /// SA (last=1): must emit 0 → SB      [deterministic]
265    /// SB (last=0): emit 1 (p=1/2) → SA,  emit 0 (p=1/2) → SB
266    /// ```
267    ///
268    /// Stationary: π_SA = 1/3, π_SB = 2/3.
269    /// Theoretical: C_μ = H([1/3,2/3]) ≈ 0.9183 bits,
270    ///              h_μ ≈ 0.6667 bits/sym (ref spec cites ≈ 0.6792), 2 states.
271    ///
272    /// The reference value 0.6792 is from 文獻B Prompt 5; the theoretical value
273    /// for p=0.5 is 2/3 ≈ 0.6667.  Both are within epsilon=0.05 of each other.
274    pub fn gen_golden_mean(n: usize, seed: u64) -> Vec<u8> {
275        let mut rng = TestRng::new(seed);
276        let mut symbols = Vec::with_capacity(n);
277        let mut state = 1u8; // start in SB (last emitted 0 — no constraint)
278        for _ in 0..n {
279            let u = rng.next_f64();
280            let (sym, next) = if state == 0 {
281                // SA: last was 1 → must emit 0 (forbids consecutive 1s)
282                (0u8, 1u8)
283            } else {
284                // SB: last was 0 → emit 0 or 1 with equal probability
285                if u < 0.5 {
286                    (0u8, 1u8)
287                } else {
288                    (1u8, 0u8)
289                }
290            };
291            symbols.push(sym);
292            state = next;
293        }
294        symbols
295    }
296}
297
298// ── Unit & KAT tests ──────────────────────────────────────────────────────────
299
#[cfg(test)]
mod tests {
    use super::test_utils::{gen_even_process, gen_golden_mean};
    use super::*;
    use approx::assert_relative_eq;

    /// Shared KAT configuration: L = 2, α = 0.001, B = 200, binary alphabet.
    fn kat_cfg() -> Config {
        Config {
            max_depth: 2,
            alpha: 0.001,
            bootstrap_b: 200,
            alphabet_size: 2,
        }
    }

    /// CI sanity: lower ≤ point ≤ upper, with a small numeric tolerance.
    fn ci_is_ordered(e: &pacr_types::Estimate) -> bool {
        e.lower <= e.point + 1e-9 && e.point <= e.upper + 1e-9
    }

    // ── Basic API ─────────────────────────────────────────────────────────────

    #[test]
    fn infer_empty_does_not_panic() {
        let r = infer(&[], Config::default());
        assert_eq!(r.num_states, 1);
        assert!(r.cognitive_split.statistical_complexity.point >= 0.0);
    }

    #[test]
    fn infer_single_symbol_stream() {
        let stream = vec![0u8; 1000];
        let cfg = Config {
            max_depth: 2,
            ..Config::default()
        };
        let r = infer(&stream, cfg);
        assert_eq!(r.num_states, 1, "constant stream → 1 state");
        assert!(r.cognitive_split.entropy_rate.point < 0.05);
    }

    #[test]
    fn infer_alternating_stream_two_states() {
        let stream: Vec<u8> = (0..2000).map(|i| (i % 2) as u8).collect();
        let cfg = Config {
            max_depth: 2,
            alpha: 0.001,
            ..Config::default()
        };
        let r = infer_fast(&stream, cfg);
        assert_eq!(r.num_states, 2, "alternating → 2 states");
    }

    // ── KAT: Even Process ─────────────────────────────────────────────────────
    //
    // Ref: Reference B, Prompt 5.
    // Theoretical: C_μ = 1.0 bit (exact), h_μ ≈ 0.9183 bits/sym, 2 states.

    #[test]
    fn kat_even_process_state_count() {
        let r = infer_fast(&gen_even_process(10_000, 42), kat_cfg());
        assert_eq!(
            r.num_states, 2,
            "Even Process must infer exactly 2 states, got {}",
            r.num_states
        );
    }

    #[test]
    fn kat_even_process_complexity() {
        let r = infer(&gen_even_process(10_000, 42), kat_cfg());
        let c = &r.cognitive_split.statistical_complexity;
        // C_μ = 1.0 bit exact.
        assert_relative_eq!(c.point, 1.0, epsilon = 0.05);
        // CI well-formed.
        assert!(ci_is_ordered(c));
    }

    #[test]
    fn kat_even_process_entropy_rate() {
        let r = infer(&gen_even_process(10_000, 42), kat_cfg());
        let h = &r.cognitive_split.entropy_rate;
        // h_μ ≈ 0.9183 bits/sym (H([1/3, 2/3]) weighted by stationary π).
        assert_relative_eq!(h.point, 0.9183, epsilon = 0.05);
        assert!(ci_is_ordered(h));
    }

    // ── KAT: Golden Mean Process ──────────────────────────────────────────────
    //
    // Ref: Reference B, Prompt 5.  Forbids consecutive 1s.
    // Theoretical: C_μ ≈ 0.9183 bits, h_μ ≈ 0.6792 bits/sym, 2 states.
    //
    // Note: h_μ theoretical for p=1/2 is 2/3 ≈ 0.6667; the reference value
    // 0.6792 is within epsilon=0.05 of both the estimate and 0.6667.

    #[test]
    fn kat_golden_mean_state_count() {
        let r = infer_fast(&gen_golden_mean(10_000, 99), kat_cfg());
        assert_eq!(
            r.num_states, 2,
            "Golden Mean must infer exactly 2 states, got {}",
            r.num_states
        );
    }

    #[test]
    fn kat_golden_mean_complexity() {
        let r = infer(&gen_golden_mean(10_000, 99), kat_cfg());
        let c = &r.cognitive_split.statistical_complexity;
        // C_μ = H([1/3, 2/3]) ≈ 0.9183 bits.
        assert_relative_eq!(c.point, 0.9183, epsilon = 0.05);
        assert!(ci_is_ordered(c));
    }

    #[test]
    fn kat_golden_mean_entropy_rate() {
        let r = infer(&gen_golden_mean(10_000, 99), kat_cfg());
        let h = &r.cognitive_split.entropy_rate;
        // h_μ ≈ 0.6792 bits/sym (Reference B, Prompt 5 reference value).
        assert_relative_eq!(h.point, 0.6792, epsilon = 0.05);
        assert!(ci_is_ordered(h));
    }

    // ── Memory budget (structural: no heap profiler, verifies no OOM) ─────────

    #[test]
    fn large_sequence_does_not_oom() {
        // N=50_000, L=4, |A|=4 — exercises the suffix-table memory path.
        // Estimated peak: ≤ N × L × |A| × 4 B ≈ 3.2 MiB, well under 200 MiB.
        // The symbol stream is a 64-bit LCG reduced mod 4.
        let stream: Vec<u8> = (0..50_000u64)
            .map(|i| {
                let lcg = i
                    .wrapping_mul(6364136223846793005)
                    .wrapping_add(1442695040888963407);
                (lcg % 4) as u8
            })
            .collect();
        let cfg = Config {
            max_depth: 4,
            alpha: 0.001,
            bootstrap_b: 10,
            alphabet_size: 4,
        };
        let r = infer_fast(&stream, cfg);
        assert!(r.num_states >= 1);
        assert!(r.cognitive_split.statistical_complexity.point >= 0.0);
    }
}
477
478// ── Property-based tests ──────────────────────────────────────────────────────
479
#[cfg(test)]
mod prop_tests {
    use super::test_utils::{gen_even_process, gen_golden_mean};
    use super::*;
    use proptest::prelude::*;

    // All properties use infer_fast / small bootstrap B so each proptest case
    // stays fast; seeds and lengths are drawn by the proptest strategies.
    proptest! {
        /// C_μ ≥ 0 for all Even Process instances.
        #[test]
        fn complexity_always_non_negative(seed in 0u64..u64::MAX, n in 200usize..1000usize) {
            let seq = gen_even_process(n, seed);
            let cfg = Config { max_depth: 2, bootstrap_b: 5, ..Config::default() };
            let r = infer_fast(&seq, cfg);
            prop_assert!(r.cognitive_split.statistical_complexity.point >= 0.0);
        }

        /// h_μ ≥ 0 for all Even Process instances.
        #[test]
        fn entropy_rate_always_non_negative(seed in 0u64..u64::MAX, n in 200usize..1000usize) {
            let seq = gen_even_process(n, seed);
            let cfg = Config { max_depth: 2, bootstrap_b: 5, ..Config::default() };
            let r = infer_fast(&seq, cfg);
            prop_assert!(r.cognitive_split.entropy_rate.point >= 0.0);
        }

        /// num_states ≥ 1 always.
        #[test]
        fn at_least_one_state(seed in 0u64..u64::MAX, n in 50usize..500usize) {
            let seq = gen_golden_mean(n, seed);
            let cfg = Config { max_depth: 2, bootstrap_b: 5, ..Config::default() };
            let r = infer_fast(&seq, cfg);
            prop_assert!(r.num_states >= 1);
        }

        /// Bootstrap CI bounds are ordered: lower ≤ point ≤ upper.
        /// (Uses the full `infer` path, since only it produces real CIs.)
        #[test]
        fn ci_bounds_ordered(seed in 0u64..u64::MAX, n in 500usize..2000usize) {
            let seq = gen_even_process(n, seed);
            let cfg = Config { max_depth: 2, bootstrap_b: 20, ..Config::default() };
            let r = infer(&seq, cfg);
            let c = &r.cognitive_split.statistical_complexity;
            let h = &r.cognitive_split.entropy_rate;
            prop_assert!(c.lower <= c.point, "C lower > point");
            prop_assert!(c.point <= c.upper, "C point > upper");
            prop_assert!(h.lower <= h.point, "H lower > point");
            prop_assert!(h.point <= h.upper, "H point > upper");
        }
    }
}