Skip to main content

mockforge_intelligence/behavioral_cloning/
sequence_learner.rs

1//! Sequence learning from recorded traffic
2//!
3//! This module provides functionality to discover and model multi-step
4//! flows from real traffic using trace correlation.
5
6use crate::behavioral_cloning::types::BehavioralSequence;
7use async_trait::async_trait;
8use mockforge_foundation::Result;
9use std::collections::HashMap;
10
11/// Trait for querying trace data for sequence learning
12#[async_trait]
13pub trait TraceQueryProvider: Send + Sync {
14    /// Get requests grouped by trace_id, ordered by timestamp
15    ///
16    /// Returns a vector of (trace_id, requests) tuples where requests
17    /// are ordered by timestamp within each trace.
18    async fn get_requests_by_trace(
19        &self,
20        min_requests_per_trace: Option<usize>,
21    ) -> Result<Vec<(String, Vec<TraceRequest>)>>;
22}
23
24/// A request in a trace for sequence learning
25#[derive(Debug, Clone)]
26pub struct TraceRequest {
27    /// Request ID
28    pub id: String,
29    /// HTTP method
30    pub method: String,
31    /// Request path
32    pub path: String,
33    /// Timestamp
34    pub timestamp: chrono::DateTime<chrono::Utc>,
35    /// Duration in milliseconds (time until next request in sequence)
36    pub duration_ms: Option<u64>,
37}
38
39/// Sequence learner for discovering behavioral patterns
40pub struct SequenceLearner;
41
42impl SequenceLearner {
43    /// Discover sequences from trace correlation
44    ///
45    /// Uses trace_id/span_id to find correlated request sequences
46    /// and identifies common patterns.
47    ///
48    /// This implementation:
49    /// 1. Queries database for requests grouped by trace_id
50    /// 2. Orders by timestamp within each trace
51    /// 3. Identifies common subsequences (e.g., login → list → detail)
52    /// 4. Calculates transition probabilities between steps
53    /// 5. Returns reusable behavioral sequences
54    pub async fn discover_sequences_from_traces(
55        provider: &dyn TraceQueryProvider,
56        min_frequency: f64,
57        min_requests_per_trace: Option<usize>,
58    ) -> Result<Vec<BehavioralSequence>> {
59        // Query database for requests grouped by trace_id
60        let trace_groups = provider.get_requests_by_trace(min_requests_per_trace).await?;
61
62        if trace_groups.is_empty() {
63            return Ok(Vec::new());
64        }
65
66        // Convert trace groups to sequences of (endpoint, method, delay) tuples
67        let mut sequences: Vec<Vec<(String, String, Option<u64>)>> = Vec::new();
68        let mut trace_ids: Vec<String> = Vec::new();
69
70        for (trace_id, requests) in trace_groups {
71            trace_ids.push(trace_id.clone());
72            let mut sequence: Vec<(String, String, Option<u64>)> = Vec::new();
73
74            // Sort requests by timestamp (should already be sorted, but ensure)
75            let mut sorted_requests = requests;
76            sorted_requests.sort_by_key(|r| r.timestamp);
77
78            // Build sequence with delays between requests
79            for (idx, request) in sorted_requests.iter().enumerate() {
80                let delay = if idx > 0 {
81                    let prev_timestamp = sorted_requests[idx - 1].timestamp;
82                    let duration =
83                        request.timestamp.signed_duration_since(prev_timestamp).num_milliseconds();
84                    if duration > 0 {
85                        Some(duration as u64)
86                    } else {
87                        None
88                    }
89                } else {
90                    None
91                };
92
93                sequence.push((request.path.clone(), request.method.clone(), delay));
94            }
95
96            if !sequence.is_empty() {
97                sequences.push(sequence);
98            }
99        }
100
101        // Use existing learn_sequence_pattern to identify common patterns
102        // This will match sequences and group them, but we need to track which trace_ids
103        // contributed to each learned sequence
104        let learned_sequences = Self::learn_sequence_pattern(&sequences, min_frequency)?;
105
106        // Map learned sequences back to trace IDs by matching patterns
107        let mut result = Vec::new();
108        for mut learned_seq in learned_sequences {
109            // Find all trace IDs that match this learned sequence pattern
110            let mut contributing_traces = Vec::new();
111            for (trace_idx, sequence) in sequences.iter().enumerate() {
112                if trace_idx < trace_ids.len() {
113                    // Check if this sequence matches the learned pattern
114                    if sequence.len() == learned_seq.steps.len() {
115                        let mut matches = true;
116                        for (step_idx, step) in learned_seq.steps.iter().enumerate() {
117                            if step_idx < sequence.len() {
118                                let (path, method, _) = &sequence[step_idx];
119                                if step.endpoint != *path || step.method != *method {
120                                    matches = false;
121                                    break;
122                                }
123                            } else {
124                                matches = false;
125                                break;
126                            }
127                        }
128                        if matches {
129                            contributing_traces.push(trace_ids[trace_idx].clone());
130                        }
131                    }
132                }
133            }
134            learned_seq.learned_from = contributing_traces;
135            result.push(learned_seq);
136        }
137
138        Ok(result)
139    }
140
141    /// Learn sequence pattern from a set of request sequences
142    ///
143    /// Analyzes multiple sequences to extract common patterns
144    /// and calculate confidence scores.
145    ///
146    /// Each sequence is represented as (endpoint, method, delay_ms) tuples.
147    pub fn learn_sequence_pattern(
148        sequences: &[Vec<(String, String, Option<u64>)>], // (endpoint, method, delay)
149        min_frequency: f64,
150    ) -> Result<Vec<BehavioralSequence>> {
151        if sequences.is_empty() {
152            return Ok(Vec::new());
153        }
154
155        // Find common subsequences using longest common subsequence (LCS) approach
156        // For simplicity, we'll find exact matches first, then look for patterns
157
158        // Step 1: Normalize sequences to (method, endpoint) pairs for comparison
159        let normalized: Vec<Vec<(String, String)>> = sequences
160            .iter()
161            .map(|seq| {
162                seq.iter()
163                    .map(|(endpoint, method, _)| (method.clone(), endpoint.clone()))
164                    .collect()
165            })
166            .collect();
167
168        // Step 2: Find exact sequence matches
169        #[allow(clippy::type_complexity)]
170        let mut sequence_counts: HashMap<Vec<(String, String)>, (usize, Vec<usize>)> =
171            HashMap::new();
172        for (idx, seq) in normalized.iter().enumerate() {
173            let entry = sequence_counts.entry(seq.clone()).or_insert_with(|| (0, Vec::new()));
174            entry.0 += 1;
175            entry.1.push(idx);
176        }
177
178        // Step 3: Build BehavioralSequences from frequent patterns
179        let total_sequences = sequences.len() as f64;
180        let mut learned_sequences = Vec::new();
181
182        for (pattern, (count, indices)) in sequence_counts {
183            let frequency = count as f64 / total_sequences;
184            if frequency < min_frequency {
185                continue;
186            }
187
188            // Calculate confidence based on consistency
189            // Higher confidence if the pattern appears consistently with similar delays
190            let mut delays_by_position: Vec<Vec<u64>> = vec![Vec::new(); pattern.len()];
191            for &idx in &indices {
192                let original_seq = &sequences[idx];
193                for (pos, (_, _, delay)) in original_seq.iter().enumerate() {
194                    if let Some(d) = delay {
195                        if pos < delays_by_position.len() {
196                            delays_by_position[pos].push(*d);
197                        }
198                    }
199                }
200            }
201
202            // Calculate average delays and variance for confidence
203            let mut steps = Vec::new();
204            let mut total_variance = 0.0;
205            for (pos, (method, endpoint)) in pattern.iter().enumerate() {
206                let avg_delay =
207                    if pos < delays_by_position.len() && !delays_by_position[pos].is_empty() {
208                        let delays = &delays_by_position[pos];
209                        let avg = delays.iter().sum::<u64>() as f64 / delays.len() as f64;
210                        let variance = delays
211                            .iter()
212                            .map(|&d| {
213                                let diff = d as f64 - avg;
214                                diff * diff
215                            })
216                            .sum::<f64>()
217                            / delays.len() as f64;
218                        total_variance += variance;
219                        Some(avg as u64)
220                    } else {
221                        None
222                    };
223
224                let mut step = crate::behavioral_cloning::types::SequenceStep::new(
225                    endpoint.clone(),
226                    method.clone(),
227                );
228                if let Some(delay) = avg_delay {
229                    step.expected_delay_ms = Some(delay);
230                }
231                // Calculate transition probability (how often this step follows the previous)
232                let transition_prob = if pos == 0 {
233                    1.0 // First step always happens
234                } else {
235                    // Count how many sequences have this step after the previous
236                    let prev_pattern = &pattern[..pos];
237                    let matching_sequences = normalized
238                        .iter()
239                        .filter(|seq| {
240                            seq.len() > pos
241                                && seq[..pos] == *prev_pattern
242                                && seq[pos] == (method.clone(), endpoint.clone())
243                        })
244                        .count();
245                    matching_sequences as f64 / count as f64
246                };
247                step.probability = transition_prob;
248                steps.push(step);
249            }
250
251            // Confidence is based on frequency and consistency (inverse of variance)
252            let avg_variance = total_variance / pattern.len() as f64;
253            let consistency_score = 1.0 / (1.0 + avg_variance / 1000.0); // Normalize variance
254            let confidence = frequency * 0.7 + consistency_score * 0.3;
255
256            // Generate sequence ID and name
257            let sequence_id = format!(
258                "seq_{}_{}",
259                pattern[0].0.to_lowercase(),
260                pattern[0].1.replace('/', "_").replace(['{', '}'], "")
261            );
262            let sequence_name =
263                format!("{} {} → {} steps", pattern[0].0, pattern[0].1, pattern.len());
264
265            let learned_from: Vec<String> =
266                indices.iter().map(|&idx| format!("trace_{}", idx)).collect();
267
268            let sequence = BehavioralSequence::new(sequence_id, sequence_name)
269                .with_frequency(frequency)
270                .with_confidence(confidence)
271                .with_learned_from(learned_from);
272
273            // Add steps
274            let mut seq_with_steps = sequence;
275            for step in steps {
276                seq_with_steps = seq_with_steps.add_step(step);
277            }
278
279            learned_sequences.push(seq_with_steps);
280        }
281
282        // Sort by frequency and confidence
283        learned_sequences.sort_by(|a, b| {
284            b.frequency
285                .partial_cmp(&a.frequency)
286                .unwrap_or(std::cmp::Ordering::Equal)
287                .then_with(|| {
288                    b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal)
289                })
290        });
291
292        Ok(learned_sequences)
293    }
294
295    // `generate_sequence_scenario` lives in mockforge-http (the only caller)
296    // — keeping ScenarioDefinition out of this crate is what lets
297    // mockforge-intelligence stop depending on mockforge-core and lets
298    // mockforge-core depend on mockforge-intelligence (see Issue #562).
299
300    /// Check if an incoming request matches a learned sequence
301    ///
302    /// Returns the matching sequence and current step index if found.
303    pub fn match_sequence<'a>(
304        sequences: &'a [BehavioralSequence],
305        endpoint: &str,
306        method: &str,
307        conditions: Option<&HashMap<String, String>>,
308    ) -> Option<(&'a BehavioralSequence, usize)> {
309        // Find sequences that start with this endpoint/method
310        for sequence in sequences {
311            if let Some(first_step) = sequence.steps.first() {
312                // Check if endpoint and method match
313                if first_step.endpoint == endpoint && first_step.method == method {
314                    // Check conditions if provided
315                    if let Some(conditions_map) = conditions {
316                        let mut matches = true;
317                        for (key, value) in &first_step.conditions {
318                            if let Some(actual_value) = conditions_map.get(key) {
319                                if actual_value != value {
320                                    matches = false;
321                                    break;
322                                }
323                            } else {
324                                matches = false;
325                                break;
326                            }
327                        }
328                        if !matches {
329                            continue;
330                        }
331                    } else if !first_step.conditions.is_empty() {
332                        // Sequence requires conditions but none provided
333                        continue;
334                    }
335
336                    // Found a match - return sequence with step index 0
337                    return Some((sequence, 0));
338                }
339            }
340        }
341
342        None
343    }
344
345    /// Find the next step in a sequence given the current step
346    ///
347    /// Returns the next step index if found, or None if sequence is complete.
348    pub fn find_next_step(
349        sequence: &BehavioralSequence,
350        current_step_idx: usize,
351        endpoint: &str,
352        method: &str,
353    ) -> Option<usize> {
354        if current_step_idx + 1 >= sequence.steps.len() {
355            return None; // Sequence complete
356        }
357
358        let next_step = &sequence.steps[current_step_idx + 1];
359        if next_step.endpoint == endpoint && next_step.method == method {
360            Some(current_step_idx + 1)
361        } else {
362            None
363        }
364    }
365}