anomaly_grid/markov_model/
mod.rs

1//! Markov Model module for variable-order Markov chain implementation
2//!
3//! This module provides a variable-order Markov model with hierarchical
4//! context selection and robust probability estimation.
5
6use crate::config::AnomalyGridConfig;
7use crate::context_tree::ContextTree;
8use crate::error::{AnomalyGridError, AnomalyGridResult};
9use std::collections::HashMap;
10
11/// Variable-order Markov model for sequence analysis
12#[derive(Debug, Clone)]
13pub struct MarkovModel {
14    /// The underlying context tree
15    context_tree: ContextTree,
16    /// Mapping from states to unique IDs
17    state_mapping: HashMap<String, usize>,
18    /// Reverse mapping from IDs to states
19    id_to_state: Vec<String>,
20    /// Configuration parameters
21    config: AnomalyGridConfig,
22}
23
24impl MarkovModel {
25    /// Create a new Markov model with specified maximum order
26    pub fn new(max_order: usize) -> AnomalyGridResult<Self> {
27        if max_order == 0 {
28            return Err(AnomalyGridError::invalid_max_order(max_order));
29        }
30
31        let config = AnomalyGridConfig::default().with_max_order(max_order)?;
32
33        Ok(Self {
34            context_tree: ContextTree::new(max_order)?,
35            state_mapping: HashMap::new(),
36            id_to_state: Vec::new(),
37            config,
38        })
39    }
40
41    /// Create a new Markov model with custom configuration
42    pub fn with_config(config: AnomalyGridConfig) -> AnomalyGridResult<Self> {
43        config.validate()?;
44
45        Ok(Self {
46            context_tree: ContextTree::new(config.max_order)?,
47            state_mapping: HashMap::new(),
48            id_to_state: Vec::new(),
49            config,
50        })
51    }
52
53    /// Train the model on a sequence
54    ///
55    /// # Complexity
56    /// - Time: O(n × max_order × |alphabet|) where n = sequence length
57    /// - Space: O(|alphabet|^max_order) in worst case
58    ///
59    /// # Performance Guarantees
60    /// - Memory usage is bounded by config.memory_limit if set
61    /// - Validates sequence length against config.min_sequence_length
62    pub fn train(&mut self, sequence: &[String]) -> AnomalyGridResult<()> {
63        // Validate sequence length
64        if sequence.len() < self.config.min_sequence_length {
65            return Err(AnomalyGridError::sequence_too_short(
66                self.config.min_sequence_length,
67                sequence.len(),
68                "model training",
69            ));
70        }
71
72        // Build state mapping
73        self.build_state_mapping(sequence);
74
75        // Build context tree with configuration
76        self.context_tree
77            .build_from_sequence(sequence, &self.config)
78    }
79
80    /// Calculate the likelihood of a sequence under the model
81    pub fn calculate_likelihood(&self, sequence: &[String]) -> f64 {
82        if sequence.len() < 2 {
83            return 1.0; // Empty or single-element sequences have likelihood 1
84        }
85
86        let mut likelihood = 1.0;
87
88        for i in 1..sequence.len() {
89            let prob = self.get_best_context_probability_for_position(sequence, i);
90            likelihood *= prob;
91        }
92
93        likelihood
94    }
95
96    /// Get the best context probability using hierarchical context selection
97    pub fn get_best_context_probability(&self, context: &[String], next_state: &str) -> f64 {
98        // Try contexts from longest to shortest (hierarchical selection)
99        for context_len in (1..=context.len().min(self.context_tree.max_order)).rev() {
100            let sub_context = &context[context.len() - context_len..];
101
102            if let Some(prob) = self
103                .context_tree
104                .get_transition_probability(sub_context, next_state)
105            {
106                return prob;
107            }
108        }
109
110        // Fallback to background probability for unseen transitions
111        self.get_background_probability(next_state)
112    }
113
114    /// Get the maximum order of the model
115    pub fn max_order(&self) -> usize {
116        self.config.max_order
117    }
118
119    /// Get the configuration
120    pub fn config(&self) -> &AnomalyGridConfig {
121        &self.config
122    }
123
124    /// Get the state mapping
125    pub fn state_mapping(&self) -> &HashMap<String, usize> {
126        &self.state_mapping
127    }
128
129    /// Get the context tree
130    pub fn context_tree(&self) -> &ContextTree {
131        &self.context_tree
132    }
133
134    /// Get mutable access to the context tree (for optimizations)
135    pub fn context_tree_mut(&mut self) -> &mut ContextTree {
136        &mut self.context_tree
137    }
138
139    /// Build state mapping from sequence
140    fn build_state_mapping(&mut self, sequence: &[String]) {
141        let mut unique_states: std::collections::HashSet<String> =
142            sequence.iter().cloned().collect();
143
144        self.state_mapping.clear();
145        self.id_to_state.clear();
146
147        for (id, state) in unique_states.drain().enumerate() {
148            self.state_mapping.insert(state.clone(), id);
149            self.id_to_state.push(state);
150        }
151    }
152
153    /// Get probability for a specific position in a sequence
154    fn get_best_context_probability_for_position(
155        &self,
156        sequence: &[String],
157        position: usize,
158    ) -> f64 {
159        let next_state = &sequence[position];
160        let max_context_len = position.min(self.context_tree.max_order);
161
162        // Try contexts from longest to shortest
163        for context_len in (1..=max_context_len).rev() {
164            let context = &sequence[position - context_len..position];
165
166            if let Some(prob) = self
167                .context_tree
168                .get_transition_probability(context, next_state)
169            {
170                return prob;
171            }
172        }
173
174        // Fallback to background probability
175        self.get_background_probability(next_state)
176    }
177
178    /// Get background probability for unseen transitions
179    fn get_background_probability(&self, state: &str) -> f64 {
180        // If state is known, use uniform probability over known states
181        if self.state_mapping.contains_key(state) {
182            let vocab_size = self.state_mapping.len() as f64;
183            (1.0 / (vocab_size + 1.0)).max(self.config.min_probability)
184        } else {
185            // For completely unknown states, use very small probability
186            self.config.min_probability
187        }
188    }
189}