Skip to main content

mockforge_intelligence/behavioral_cloning/
sequence_learner.rs

1//! Sequence learning from recorded traffic
2//!
3//! This module provides functionality to discover and model multi-step
4//! flows from real traffic using trace correlation.
5
6use crate::behavioral_cloning::types::BehavioralSequence;
7use async_trait::async_trait;
8use mockforge_core::scenarios::ScenarioDefinition;
9use mockforge_core::Result;
10use std::collections::HashMap;
11
12/// Trait for querying trace data for sequence learning
13#[async_trait]
14pub trait TraceQueryProvider: Send + Sync {
15    /// Get requests grouped by trace_id, ordered by timestamp
16    ///
17    /// Returns a vector of (trace_id, requests) tuples where requests
18    /// are ordered by timestamp within each trace.
19    async fn get_requests_by_trace(
20        &self,
21        min_requests_per_trace: Option<usize>,
22    ) -> Result<Vec<(String, Vec<TraceRequest>)>>;
23}
24
25/// A request in a trace for sequence learning
26#[derive(Debug, Clone)]
27pub struct TraceRequest {
28    /// Request ID
29    pub id: String,
30    /// HTTP method
31    pub method: String,
32    /// Request path
33    pub path: String,
34    /// Timestamp
35    pub timestamp: chrono::DateTime<chrono::Utc>,
36    /// Duration in milliseconds (time until next request in sequence)
37    pub duration_ms: Option<u64>,
38}
39
40/// Sequence learner for discovering behavioral patterns
41pub struct SequenceLearner;
42
43impl SequenceLearner {
44    /// Discover sequences from trace correlation
45    ///
46    /// Uses trace_id/span_id to find correlated request sequences
47    /// and identifies common patterns.
48    ///
49    /// This implementation:
50    /// 1. Queries database for requests grouped by trace_id
51    /// 2. Orders by timestamp within each trace
52    /// 3. Identifies common subsequences (e.g., login → list → detail)
53    /// 4. Calculates transition probabilities between steps
54    /// 5. Returns reusable behavioral sequences
55    pub async fn discover_sequences_from_traces(
56        provider: &dyn TraceQueryProvider,
57        min_frequency: f64,
58        min_requests_per_trace: Option<usize>,
59    ) -> Result<Vec<BehavioralSequence>> {
60        // Query database for requests grouped by trace_id
61        let trace_groups = provider.get_requests_by_trace(min_requests_per_trace).await?;
62
63        if trace_groups.is_empty() {
64            return Ok(Vec::new());
65        }
66
67        // Convert trace groups to sequences of (endpoint, method, delay) tuples
68        let mut sequences: Vec<Vec<(String, String, Option<u64>)>> = Vec::new();
69        let mut trace_ids: Vec<String> = Vec::new();
70
71        for (trace_id, requests) in trace_groups {
72            trace_ids.push(trace_id.clone());
73            let mut sequence: Vec<(String, String, Option<u64>)> = Vec::new();
74
75            // Sort requests by timestamp (should already be sorted, but ensure)
76            let mut sorted_requests = requests;
77            sorted_requests.sort_by_key(|r| r.timestamp);
78
79            // Build sequence with delays between requests
80            for (idx, request) in sorted_requests.iter().enumerate() {
81                let delay = if idx > 0 {
82                    let prev_timestamp = sorted_requests[idx - 1].timestamp;
83                    let duration =
84                        request.timestamp.signed_duration_since(prev_timestamp).num_milliseconds();
85                    if duration > 0 {
86                        Some(duration as u64)
87                    } else {
88                        None
89                    }
90                } else {
91                    None
92                };
93
94                sequence.push((request.path.clone(), request.method.clone(), delay));
95            }
96
97            if !sequence.is_empty() {
98                sequences.push(sequence);
99            }
100        }
101
102        // Use existing learn_sequence_pattern to identify common patterns
103        // This will match sequences and group them, but we need to track which trace_ids
104        // contributed to each learned sequence
105        let learned_sequences = Self::learn_sequence_pattern(&sequences, min_frequency)?;
106
107        // Map learned sequences back to trace IDs by matching patterns
108        let mut result = Vec::new();
109        for mut learned_seq in learned_sequences {
110            // Find all trace IDs that match this learned sequence pattern
111            let mut contributing_traces = Vec::new();
112            for (trace_idx, sequence) in sequences.iter().enumerate() {
113                if trace_idx < trace_ids.len() {
114                    // Check if this sequence matches the learned pattern
115                    if sequence.len() == learned_seq.steps.len() {
116                        let mut matches = true;
117                        for (step_idx, step) in learned_seq.steps.iter().enumerate() {
118                            if step_idx < sequence.len() {
119                                let (path, method, _) = &sequence[step_idx];
120                                if step.endpoint != *path || step.method != *method {
121                                    matches = false;
122                                    break;
123                                }
124                            } else {
125                                matches = false;
126                                break;
127                            }
128                        }
129                        if matches {
130                            contributing_traces.push(trace_ids[trace_idx].clone());
131                        }
132                    }
133                }
134            }
135            learned_seq.learned_from = contributing_traces;
136            result.push(learned_seq);
137        }
138
139        Ok(result)
140    }
141
142    /// Learn sequence pattern from a set of request sequences
143    ///
144    /// Analyzes multiple sequences to extract common patterns
145    /// and calculate confidence scores.
146    ///
147    /// Each sequence is represented as (endpoint, method, delay_ms) tuples.
148    pub fn learn_sequence_pattern(
149        sequences: &[Vec<(String, String, Option<u64>)>], // (endpoint, method, delay)
150        min_frequency: f64,
151    ) -> Result<Vec<BehavioralSequence>> {
152        if sequences.is_empty() {
153            return Ok(Vec::new());
154        }
155
156        // Find common subsequences using longest common subsequence (LCS) approach
157        // For simplicity, we'll find exact matches first, then look for patterns
158
159        // Step 1: Normalize sequences to (method, endpoint) pairs for comparison
160        let normalized: Vec<Vec<(String, String)>> = sequences
161            .iter()
162            .map(|seq| {
163                seq.iter()
164                    .map(|(endpoint, method, _)| (method.clone(), endpoint.clone()))
165                    .collect()
166            })
167            .collect();
168
169        // Step 2: Find exact sequence matches
170        #[allow(clippy::type_complexity)]
171        let mut sequence_counts: HashMap<Vec<(String, String)>, (usize, Vec<usize>)> =
172            HashMap::new();
173        for (idx, seq) in normalized.iter().enumerate() {
174            let entry = sequence_counts.entry(seq.clone()).or_insert_with(|| (0, Vec::new()));
175            entry.0 += 1;
176            entry.1.push(idx);
177        }
178
179        // Step 3: Build BehavioralSequences from frequent patterns
180        let total_sequences = sequences.len() as f64;
181        let mut learned_sequences = Vec::new();
182
183        for (pattern, (count, indices)) in sequence_counts {
184            let frequency = count as f64 / total_sequences;
185            if frequency < min_frequency {
186                continue;
187            }
188
189            // Calculate confidence based on consistency
190            // Higher confidence if the pattern appears consistently with similar delays
191            let mut delays_by_position: Vec<Vec<u64>> = vec![Vec::new(); pattern.len()];
192            for &idx in &indices {
193                let original_seq = &sequences[idx];
194                for (pos, (_, _, delay)) in original_seq.iter().enumerate() {
195                    if let Some(d) = delay {
196                        if pos < delays_by_position.len() {
197                            delays_by_position[pos].push(*d);
198                        }
199                    }
200                }
201            }
202
203            // Calculate average delays and variance for confidence
204            let mut steps = Vec::new();
205            let mut total_variance = 0.0;
206            for (pos, (method, endpoint)) in pattern.iter().enumerate() {
207                let avg_delay =
208                    if pos < delays_by_position.len() && !delays_by_position[pos].is_empty() {
209                        let delays = &delays_by_position[pos];
210                        let avg = delays.iter().sum::<u64>() as f64 / delays.len() as f64;
211                        let variance = delays
212                            .iter()
213                            .map(|&d| {
214                                let diff = d as f64 - avg;
215                                diff * diff
216                            })
217                            .sum::<f64>()
218                            / delays.len() as f64;
219                        total_variance += variance;
220                        Some(avg as u64)
221                    } else {
222                        None
223                    };
224
225                let mut step = crate::behavioral_cloning::types::SequenceStep::new(
226                    endpoint.clone(),
227                    method.clone(),
228                );
229                if let Some(delay) = avg_delay {
230                    step.expected_delay_ms = Some(delay);
231                }
232                // Calculate transition probability (how often this step follows the previous)
233                let transition_prob = if pos == 0 {
234                    1.0 // First step always happens
235                } else {
236                    // Count how many sequences have this step after the previous
237                    let prev_pattern = &pattern[..pos];
238                    let matching_sequences = normalized
239                        .iter()
240                        .filter(|seq| {
241                            seq.len() > pos
242                                && seq[..pos] == *prev_pattern
243                                && seq[pos] == (method.clone(), endpoint.clone())
244                        })
245                        .count();
246                    matching_sequences as f64 / count as f64
247                };
248                step.probability = transition_prob;
249                steps.push(step);
250            }
251
252            // Confidence is based on frequency and consistency (inverse of variance)
253            let avg_variance = total_variance / pattern.len() as f64;
254            let consistency_score = 1.0 / (1.0 + avg_variance / 1000.0); // Normalize variance
255            let confidence = frequency * 0.7 + consistency_score * 0.3;
256
257            // Generate sequence ID and name
258            let sequence_id = format!(
259                "seq_{}_{}",
260                pattern[0].0.to_lowercase(),
261                pattern[0].1.replace('/', "_").replace(['{', '}'], "")
262            );
263            let sequence_name =
264                format!("{} {} → {} steps", pattern[0].0, pattern[0].1, pattern.len());
265
266            let learned_from: Vec<String> =
267                indices.iter().map(|&idx| format!("trace_{}", idx)).collect();
268
269            let sequence = BehavioralSequence::new(sequence_id, sequence_name)
270                .with_frequency(frequency)
271                .with_confidence(confidence)
272                .with_learned_from(learned_from);
273
274            // Add steps
275            let mut seq_with_steps = sequence;
276            for step in steps {
277                seq_with_steps = seq_with_steps.add_step(step);
278            }
279
280            learned_sequences.push(seq_with_steps);
281        }
282
283        // Sort by frequency and confidence
284        learned_sequences.sort_by(|a, b| {
285            b.frequency
286                .partial_cmp(&a.frequency)
287                .unwrap_or(std::cmp::Ordering::Equal)
288                .then_with(|| {
289                    b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal)
290                })
291        });
292
293        Ok(learned_sequences)
294    }
295
296    /// Generate a scenario definition from a learned sequence
297    ///
298    /// Converts a BehavioralSequence into a ScenarioDefinition
299    /// that can be executed by the ScenarioExecutor.
300    pub fn generate_sequence_scenario(sequence: &BehavioralSequence) -> ScenarioDefinition {
301        use mockforge_core::scenarios::ScenarioStep;
302
303        let mut scenario =
304            ScenarioDefinition::new(&sequence.id, &sequence.name).with_tags(sequence.tags.clone());
305
306        if let Some(description) = &sequence.description {
307            scenario.description = Some(description.clone());
308        }
309
310        // Convert sequence steps to scenario steps
311        for (idx, step) in sequence.steps.iter().enumerate() {
312            let mut scenario_step = ScenarioStep::new(
313                format!("step_{}", idx),
314                step.name.as_deref().unwrap_or(&step.endpoint),
315                &step.method,
316                &step.endpoint,
317            )
318            .expect_status(200); // Default, could be learned
319
320            if let Some(delay) = step.expected_delay_ms {
321                scenario_step.delay_ms = Some(delay);
322            }
323
324            // Add conditions as query params if needed
325            for (key, value) in &step.conditions {
326                scenario_step.query_params.insert(key.clone(), value.clone());
327            }
328
329            scenario = scenario.add_step(scenario_step);
330        }
331
332        scenario
333    }
334
335    /// Check if an incoming request matches a learned sequence
336    ///
337    /// Returns the matching sequence and current step index if found.
338    pub fn match_sequence<'a>(
339        sequences: &'a [BehavioralSequence],
340        endpoint: &str,
341        method: &str,
342        conditions: Option<&HashMap<String, String>>,
343    ) -> Option<(&'a BehavioralSequence, usize)> {
344        // Find sequences that start with this endpoint/method
345        for sequence in sequences {
346            if let Some(first_step) = sequence.steps.first() {
347                // Check if endpoint and method match
348                if first_step.endpoint == endpoint && first_step.method == method {
349                    // Check conditions if provided
350                    if let Some(conditions_map) = conditions {
351                        let mut matches = true;
352                        for (key, value) in &first_step.conditions {
353                            if let Some(actual_value) = conditions_map.get(key) {
354                                if actual_value != value {
355                                    matches = false;
356                                    break;
357                                }
358                            } else {
359                                matches = false;
360                                break;
361                            }
362                        }
363                        if !matches {
364                            continue;
365                        }
366                    } else if !first_step.conditions.is_empty() {
367                        // Sequence requires conditions but none provided
368                        continue;
369                    }
370
371                    // Found a match - return sequence with step index 0
372                    return Some((sequence, 0));
373                }
374            }
375        }
376
377        None
378    }
379
380    /// Find the next step in a sequence given the current step
381    ///
382    /// Returns the next step index if found, or None if sequence is complete.
383    pub fn find_next_step(
384        sequence: &BehavioralSequence,
385        current_step_idx: usize,
386        endpoint: &str,
387        method: &str,
388    ) -> Option<usize> {
389        if current_step_idx + 1 >= sequence.steps.len() {
390            return None; // Sequence complete
391        }
392
393        let next_step = &sequence.steps[current_step_idx + 1];
394        if next_step.endpoint == endpoint && next_step.method == method {
395            Some(current_step_idx + 1)
396        } else {
397            None
398        }
399    }
400}