mockforge_intelligence/behavioral_cloning/sequence_learner.rs
1//! Sequence learning from recorded traffic
2//!
3//! This module provides functionality to discover and model multi-step
4//! flows from real traffic using trace correlation.
5
6use crate::behavioral_cloning::types::BehavioralSequence;
7use async_trait::async_trait;
8use mockforge_foundation::Result;
9use std::collections::HashMap;
10
11/// Trait for querying trace data for sequence learning
12#[async_trait]
13pub trait TraceQueryProvider: Send + Sync {
14 /// Get requests grouped by trace_id, ordered by timestamp
15 ///
16 /// Returns a vector of (trace_id, requests) tuples where requests
17 /// are ordered by timestamp within each trace.
18 async fn get_requests_by_trace(
19 &self,
20 min_requests_per_trace: Option<usize>,
21 ) -> Result<Vec<(String, Vec<TraceRequest>)>>;
22}
23
24/// A request in a trace for sequence learning
25#[derive(Debug, Clone)]
26pub struct TraceRequest {
27 /// Request ID
28 pub id: String,
29 /// HTTP method
30 pub method: String,
31 /// Request path
32 pub path: String,
33 /// Timestamp
34 pub timestamp: chrono::DateTime<chrono::Utc>,
35 /// Duration in milliseconds (time until next request in sequence)
36 pub duration_ms: Option<u64>,
37}
38
39/// Sequence learner for discovering behavioral patterns
40pub struct SequenceLearner;
41
42impl SequenceLearner {
43 /// Discover sequences from trace correlation
44 ///
45 /// Uses trace_id/span_id to find correlated request sequences
46 /// and identifies common patterns.
47 ///
48 /// This implementation:
49 /// 1. Queries database for requests grouped by trace_id
50 /// 2. Orders by timestamp within each trace
51 /// 3. Identifies common subsequences (e.g., login → list → detail)
52 /// 4. Calculates transition probabilities between steps
53 /// 5. Returns reusable behavioral sequences
54 pub async fn discover_sequences_from_traces(
55 provider: &dyn TraceQueryProvider,
56 min_frequency: f64,
57 min_requests_per_trace: Option<usize>,
58 ) -> Result<Vec<BehavioralSequence>> {
59 // Query database for requests grouped by trace_id
60 let trace_groups = provider.get_requests_by_trace(min_requests_per_trace).await?;
61
62 if trace_groups.is_empty() {
63 return Ok(Vec::new());
64 }
65
66 // Convert trace groups to sequences of (endpoint, method, delay) tuples
67 let mut sequences: Vec<Vec<(String, String, Option<u64>)>> = Vec::new();
68 let mut trace_ids: Vec<String> = Vec::new();
69
70 for (trace_id, requests) in trace_groups {
71 trace_ids.push(trace_id.clone());
72 let mut sequence: Vec<(String, String, Option<u64>)> = Vec::new();
73
74 // Sort requests by timestamp (should already be sorted, but ensure)
75 let mut sorted_requests = requests;
76 sorted_requests.sort_by_key(|r| r.timestamp);
77
78 // Build sequence with delays between requests
79 for (idx, request) in sorted_requests.iter().enumerate() {
80 let delay = if idx > 0 {
81 let prev_timestamp = sorted_requests[idx - 1].timestamp;
82 let duration =
83 request.timestamp.signed_duration_since(prev_timestamp).num_milliseconds();
84 if duration > 0 {
85 Some(duration as u64)
86 } else {
87 None
88 }
89 } else {
90 None
91 };
92
93 sequence.push((request.path.clone(), request.method.clone(), delay));
94 }
95
96 if !sequence.is_empty() {
97 sequences.push(sequence);
98 }
99 }
100
101 // Use existing learn_sequence_pattern to identify common patterns
102 // This will match sequences and group them, but we need to track which trace_ids
103 // contributed to each learned sequence
104 let learned_sequences = Self::learn_sequence_pattern(&sequences, min_frequency)?;
105
106 // Map learned sequences back to trace IDs by matching patterns
107 let mut result = Vec::new();
108 for mut learned_seq in learned_sequences {
109 // Find all trace IDs that match this learned sequence pattern
110 let mut contributing_traces = Vec::new();
111 for (trace_idx, sequence) in sequences.iter().enumerate() {
112 if trace_idx < trace_ids.len() {
113 // Check if this sequence matches the learned pattern
114 if sequence.len() == learned_seq.steps.len() {
115 let mut matches = true;
116 for (step_idx, step) in learned_seq.steps.iter().enumerate() {
117 if step_idx < sequence.len() {
118 let (path, method, _) = &sequence[step_idx];
119 if step.endpoint != *path || step.method != *method {
120 matches = false;
121 break;
122 }
123 } else {
124 matches = false;
125 break;
126 }
127 }
128 if matches {
129 contributing_traces.push(trace_ids[trace_idx].clone());
130 }
131 }
132 }
133 }
134 learned_seq.learned_from = contributing_traces;
135 result.push(learned_seq);
136 }
137
138 Ok(result)
139 }
140
141 /// Learn sequence pattern from a set of request sequences
142 ///
143 /// Analyzes multiple sequences to extract common patterns
144 /// and calculate confidence scores.
145 ///
146 /// Each sequence is represented as (endpoint, method, delay_ms) tuples.
147 pub fn learn_sequence_pattern(
148 sequences: &[Vec<(String, String, Option<u64>)>], // (endpoint, method, delay)
149 min_frequency: f64,
150 ) -> Result<Vec<BehavioralSequence>> {
151 if sequences.is_empty() {
152 return Ok(Vec::new());
153 }
154
155 // Find common subsequences using longest common subsequence (LCS) approach
156 // For simplicity, we'll find exact matches first, then look for patterns
157
158 // Step 1: Normalize sequences to (method, endpoint) pairs for comparison
159 let normalized: Vec<Vec<(String, String)>> = sequences
160 .iter()
161 .map(|seq| {
162 seq.iter()
163 .map(|(endpoint, method, _)| (method.clone(), endpoint.clone()))
164 .collect()
165 })
166 .collect();
167
168 // Step 2: Find exact sequence matches
169 #[allow(clippy::type_complexity)]
170 let mut sequence_counts: HashMap<Vec<(String, String)>, (usize, Vec<usize>)> =
171 HashMap::new();
172 for (idx, seq) in normalized.iter().enumerate() {
173 let entry = sequence_counts.entry(seq.clone()).or_insert_with(|| (0, Vec::new()));
174 entry.0 += 1;
175 entry.1.push(idx);
176 }
177
178 // Step 3: Build BehavioralSequences from frequent patterns
179 let total_sequences = sequences.len() as f64;
180 let mut learned_sequences = Vec::new();
181
182 for (pattern, (count, indices)) in sequence_counts {
183 let frequency = count as f64 / total_sequences;
184 if frequency < min_frequency {
185 continue;
186 }
187
188 // Calculate confidence based on consistency
189 // Higher confidence if the pattern appears consistently with similar delays
190 let mut delays_by_position: Vec<Vec<u64>> = vec![Vec::new(); pattern.len()];
191 for &idx in &indices {
192 let original_seq = &sequences[idx];
193 for (pos, (_, _, delay)) in original_seq.iter().enumerate() {
194 if let Some(d) = delay {
195 if pos < delays_by_position.len() {
196 delays_by_position[pos].push(*d);
197 }
198 }
199 }
200 }
201
202 // Calculate average delays and variance for confidence
203 let mut steps = Vec::new();
204 let mut total_variance = 0.0;
205 for (pos, (method, endpoint)) in pattern.iter().enumerate() {
206 let avg_delay =
207 if pos < delays_by_position.len() && !delays_by_position[pos].is_empty() {
208 let delays = &delays_by_position[pos];
209 let avg = delays.iter().sum::<u64>() as f64 / delays.len() as f64;
210 let variance = delays
211 .iter()
212 .map(|&d| {
213 let diff = d as f64 - avg;
214 diff * diff
215 })
216 .sum::<f64>()
217 / delays.len() as f64;
218 total_variance += variance;
219 Some(avg as u64)
220 } else {
221 None
222 };
223
224 let mut step = crate::behavioral_cloning::types::SequenceStep::new(
225 endpoint.clone(),
226 method.clone(),
227 );
228 if let Some(delay) = avg_delay {
229 step.expected_delay_ms = Some(delay);
230 }
231 // Calculate transition probability (how often this step follows the previous)
232 let transition_prob = if pos == 0 {
233 1.0 // First step always happens
234 } else {
235 // Count how many sequences have this step after the previous
236 let prev_pattern = &pattern[..pos];
237 let matching_sequences = normalized
238 .iter()
239 .filter(|seq| {
240 seq.len() > pos
241 && seq[..pos] == *prev_pattern
242 && seq[pos] == (method.clone(), endpoint.clone())
243 })
244 .count();
245 matching_sequences as f64 / count as f64
246 };
247 step.probability = transition_prob;
248 steps.push(step);
249 }
250
251 // Confidence is based on frequency and consistency (inverse of variance)
252 let avg_variance = total_variance / pattern.len() as f64;
253 let consistency_score = 1.0 / (1.0 + avg_variance / 1000.0); // Normalize variance
254 let confidence = frequency * 0.7 + consistency_score * 0.3;
255
256 // Generate sequence ID and name
257 let sequence_id = format!(
258 "seq_{}_{}",
259 pattern[0].0.to_lowercase(),
260 pattern[0].1.replace('/', "_").replace(['{', '}'], "")
261 );
262 let sequence_name =
263 format!("{} {} → {} steps", pattern[0].0, pattern[0].1, pattern.len());
264
265 let learned_from: Vec<String> =
266 indices.iter().map(|&idx| format!("trace_{}", idx)).collect();
267
268 let sequence = BehavioralSequence::new(sequence_id, sequence_name)
269 .with_frequency(frequency)
270 .with_confidence(confidence)
271 .with_learned_from(learned_from);
272
273 // Add steps
274 let mut seq_with_steps = sequence;
275 for step in steps {
276 seq_with_steps = seq_with_steps.add_step(step);
277 }
278
279 learned_sequences.push(seq_with_steps);
280 }
281
282 // Sort by frequency and confidence
283 learned_sequences.sort_by(|a, b| {
284 b.frequency
285 .partial_cmp(&a.frequency)
286 .unwrap_or(std::cmp::Ordering::Equal)
287 .then_with(|| {
288 b.confidence.partial_cmp(&a.confidence).unwrap_or(std::cmp::Ordering::Equal)
289 })
290 });
291
292 Ok(learned_sequences)
293 }
294
295 // `generate_sequence_scenario` lives in mockforge-http (the only caller)
296 // — keeping ScenarioDefinition out of this crate is what lets
297 // mockforge-intelligence stop depending on mockforge-core and lets
298 // mockforge-core depend on mockforge-intelligence (see Issue #562).
299
300 /// Check if an incoming request matches a learned sequence
301 ///
302 /// Returns the matching sequence and current step index if found.
303 pub fn match_sequence<'a>(
304 sequences: &'a [BehavioralSequence],
305 endpoint: &str,
306 method: &str,
307 conditions: Option<&HashMap<String, String>>,
308 ) -> Option<(&'a BehavioralSequence, usize)> {
309 // Find sequences that start with this endpoint/method
310 for sequence in sequences {
311 if let Some(first_step) = sequence.steps.first() {
312 // Check if endpoint and method match
313 if first_step.endpoint == endpoint && first_step.method == method {
314 // Check conditions if provided
315 if let Some(conditions_map) = conditions {
316 let mut matches = true;
317 for (key, value) in &first_step.conditions {
318 if let Some(actual_value) = conditions_map.get(key) {
319 if actual_value != value {
320 matches = false;
321 break;
322 }
323 } else {
324 matches = false;
325 break;
326 }
327 }
328 if !matches {
329 continue;
330 }
331 } else if !first_step.conditions.is_empty() {
332 // Sequence requires conditions but none provided
333 continue;
334 }
335
336 // Found a match - return sequence with step index 0
337 return Some((sequence, 0));
338 }
339 }
340 }
341
342 None
343 }
344
345 /// Find the next step in a sequence given the current step
346 ///
347 /// Returns the next step index if found, or None if sequence is complete.
348 pub fn find_next_step(
349 sequence: &BehavioralSequence,
350 current_step_idx: usize,
351 endpoint: &str,
352 method: &str,
353 ) -> Option<usize> {
354 if current_step_idx + 1 >= sequence.steps.len() {
355 return None; // Sequence complete
356 }
357
358 let next_step = &sequence.steps[current_step_idx + 1];
359 if next_step.endpoint == endpoint && next_step.method == method {
360 Some(current_step_idx + 1)
361 } else {
362 None
363 }
364 }
365}