anomaly_grid/context_tree/
mod.rs

1//! Context Tree module for variable-order Markov model implementation
2//!
3//! This module implements context storage and probability estimation for building
4//! variable-order Markov models with information-theoretic measures.
5
6use crate::config::AnomalyGridConfig;
7use crate::error::{AnomalyGridError, AnomalyGridResult};
8use std::collections::HashMap;
9
10/// A node in the context tree that stores transition statistics
11#[derive(Debug, Clone)]
12pub struct ContextNode {
13    /// Raw transition counts for each next state
14    pub counts: HashMap<String, usize>,
15    /// Normalized transition probabilities
16    pub probabilities: HashMap<String, f64>,
17    /// Shannon entropy of the transition distribution
18    pub entropy: f64,
19    /// KL divergence from uniform distribution
20    pub kl_divergence: f64,
21}
22
23impl ContextNode {
24    /// Create a new empty context node
25    pub fn new() -> Self {
26        Self {
27            counts: HashMap::new(),
28            probabilities: HashMap::new(),
29            entropy: 0.0,
30            kl_divergence: 0.0,
31        }
32    }
33
34    /// Add a transition to this context
35    pub fn add_transition(&mut self, next_state: String) {
36        *self.counts.entry(next_state).or_insert(0) += 1;
37    }
38
39    /// Calculate probabilities and information-theoretic measures
40    pub fn calculate_probabilities(&mut self, config: &AnomalyGridConfig) {
41        if self.counts.is_empty() {
42            return;
43        }
44
45        let total_count: usize = self.counts.values().sum();
46        let vocab_size = self.counts.len();
47
48        // Calculate probabilities with configurable Laplace smoothing
49        self.probabilities.clear();
50
51        for (state, &count) in &self.counts {
52            let smoothed_prob = (count as f64 + config.smoothing_alpha)
53                / (total_count as f64 + config.smoothing_alpha * vocab_size as f64);
54            self.probabilities.insert(state.clone(), smoothed_prob);
55        }
56
57        // Calculate Shannon entropy: H(X) = -∑ P(x) log₂ P(x)
58        // CRITICAL FIX: Correct formula is -p * log2(p), not -p.log2()
59        self.entropy = self
60            .probabilities
61            .values()
62            .map(|&p| if p > 0.0 { -p * p.log2() } else { 0.0 })
63            .sum();
64
65        // Calculate KL divergence from uniform distribution
66        let uniform_prob = 1.0 / vocab_size as f64;
67        self.kl_divergence = self
68            .probabilities
69            .values()
70            .map(|&p| {
71                if p > 0.0 {
72                    p * (p / uniform_prob).log2()
73                } else {
74                    0.0
75                }
76            })
77            .sum();
78    }
79}
80
81impl Default for ContextNode {
82    fn default() -> Self {
83        Self::new()
84    }
85}
86
87/// Context tree for storing variable-order Markov chain contexts
88#[derive(Debug, Clone)]
89pub struct ContextTree {
90    /// Map from context sequences to context nodes
91    pub contexts: HashMap<Vec<String>, ContextNode>,
92    /// Maximum context order (length)
93    pub max_order: usize,
94}
95
96impl ContextTree {
97    /// Create a new context tree with specified maximum order
98    pub fn new(max_order: usize) -> AnomalyGridResult<Self> {
99        if max_order == 0 {
100            return Err(AnomalyGridError::invalid_max_order(max_order));
101        }
102
103        Ok(Self {
104            contexts: HashMap::new(),
105            max_order,
106        })
107    }
108
109    /// Build the context tree from a training sequence
110    ///
111    /// # Complexity
112    /// - Time: O(n × max_order × |alphabet|) where n = sequence length
113    /// - Space: O(|alphabet|^max_order) in worst case
114    ///
115    /// # Performance Guarantees
116    /// - Memory usage is bounded by config.memory_limit if set
117    /// - Processing time scales linearly with sequence length
118    pub fn build_from_sequence(
119        &mut self,
120        sequence: &[String],
121        config: &AnomalyGridConfig,
122    ) -> AnomalyGridResult<()> {
123        // Validate sequence length
124        if sequence.len() < config.min_sequence_length {
125            return Err(AnomalyGridError::sequence_too_short(
126                config.min_sequence_length,
127                sequence.len(),
128                "context tree building",
129            ));
130        }
131
132        // Extract contexts of all orders from 1 to max_order
133        for window_size in 1..=self.max_order {
134            for window in sequence.windows(window_size + 1) {
135                // Check memory limit before adding new context
136                if let Some(limit) = config.memory_limit {
137                    if self.contexts.len() >= limit {
138                        return Err(AnomalyGridError::memory_limit_exceeded(
139                            self.contexts.len(),
140                            limit,
141                        ));
142                    }
143                }
144
145                let context = window[..window_size].to_vec();
146                let next_state = &window[window_size];
147
148                let node = self.contexts.entry(context).or_default();
149                node.add_transition(next_state.clone());
150            }
151        }
152
153        // Calculate probabilities for all contexts
154        for node in self.contexts.values_mut() {
155            node.calculate_probabilities(config);
156        }
157
158        Ok(())
159    }
160
161    /// Get the transition probability for a given context and next state
162    pub fn get_transition_probability(&self, context: &[String], next_state: &str) -> Option<f64> {
163        self.contexts
164            .get(context)
165            .and_then(|node| node.probabilities.get(next_state))
166            .copied()
167    }
168
169    /// Get a context node for the given context
170    pub fn get_context_node(&self, context: &[String]) -> Option<&ContextNode> {
171        self.contexts.get(context)
172    }
173
174    /// Get all contexts of a specific order
175    pub fn get_contexts_of_order(&self, order: usize) -> Vec<&Vec<String>> {
176        self.contexts
177            .keys()
178            .filter(|context| context.len() == order)
179            .collect()
180    }
181
182    /// Get the number of contexts stored
183    pub fn context_count(&self) -> usize {
184        self.contexts.len()
185    }
186}