ruvector_data_framework/
discovery.rs

//! Discovery engine for detecting novel patterns from coherence signals
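//!
//! A minimal usage sketch (illustrative only; the import path and the way
//! `CoherenceSignal` values are produced are assumptions, so the block is not
//! compiled as a doc-test):
//!
//! ```ignore
//! use ruvector_data_framework::{DiscoveryConfig, DiscoveryEngine};
//!
//! let mut engine = DiscoveryEngine::new(DiscoveryConfig::default());
//! // `signals` is a slice of CoherenceSignal values produced elsewhere.
//! let patterns = engine.detect(&signals).expect("detection failed");
//! for pattern in &patterns {
//!     println!("{:?}: {}", pattern.category, pattern.description);
//! }
//! ```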

use std::collections::HashMap;

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};

use crate::{CoherenceSignal, Result};

/// Configuration for discovery engine
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiscoveryConfig {
    /// Minimum signal strength to consider
    pub min_signal_strength: f64,

    /// Number of recent windows to analyze for emergence and trend detection
    pub lookback_windows: usize,

    /// Emergence threshold: average per-window node growth as a fraction of
    /// the window's starting node count
    pub emergence_threshold: f64,

    /// Split threshold: fractional drop in min-cut value between consecutive
    /// windows
    pub split_threshold: f64,

    /// Bridge threshold: fraction of windows in which a node must appear on a
    /// cut boundary
    pub bridge_threshold: f64,

    /// Enable anomaly detection
    pub detect_anomalies: bool,

    /// Anomaly sensitivity (standard deviations)
    pub anomaly_sigma: f64,
}

impl Default for DiscoveryConfig {
    fn default() -> Self {
        Self {
            min_signal_strength: 0.01,
            lookback_windows: 10,
            emergence_threshold: 0.2,
            split_threshold: 0.5,
            bridge_threshold: 0.3,
            detect_anomalies: true,
            anomaly_sigma: 2.5,
        }
    }
}

/// Categories of discoverable patterns
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum PatternCategory {
    /// New cluster/community emerging
    Emergence,

    /// Existing structure splitting
    Split,

    /// Two structures merging
    Merge,

    /// Cross-domain connection forming
    Bridge,

    /// Unusual coherence pattern
    Anomaly,

    /// Gradual strengthening
    Consolidation,

    /// Gradual weakening
    Dissolution,

    /// Cyclical pattern detected
    Cyclical,
}

/// Strength of discovered pattern
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Ord, PartialOrd)]
pub enum PatternStrength {
    /// Weak signal, might be noise
    Weak,

    /// Moderate signal, worth monitoring
    Moderate,

    /// Strong signal, likely real
    Strong,

    /// Very strong signal, high confidence
    VeryStrong,
}

impl PatternStrength {
    /// Convert from numeric score
    pub fn from_score(score: f64) -> Self {
        if score < 0.25 {
            PatternStrength::Weak
        } else if score < 0.5 {
            PatternStrength::Moderate
        } else if score < 0.75 {
            PatternStrength::Strong
        } else {
            PatternStrength::VeryStrong
        }
    }
}

/// A discovered pattern
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiscoveryPattern {
    /// Unique pattern identifier
    pub id: String,

    /// Pattern category
    pub category: PatternCategory,

    /// Pattern strength
    pub strength: PatternStrength,

    /// Numeric confidence score (0-1)
    pub confidence: f64,

    /// When pattern was first detected
    pub detected_at: DateTime<Utc>,

    /// Time range pattern spans
    pub time_range: Option<(DateTime<Utc>, DateTime<Utc>)>,

    /// Related nodes/entities
    pub entities: Vec<String>,

    /// Description of pattern
    pub description: String,

    /// Supporting evidence
    pub evidence: Vec<PatternEvidence>,

    /// Additional metadata
    pub metadata: HashMap<String, serde_json::Value>,
}

/// Evidence supporting a pattern
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PatternEvidence {
    /// Evidence type
    pub evidence_type: String,

    /// Numeric value
    pub value: f64,

    /// Reference to source signal/data
    pub source_ref: String,

    /// Human-readable explanation
    pub explanation: String,
}

/// Discovery engine for pattern detection
pub struct DiscoveryEngine {
    config: DiscoveryConfig,
    patterns: Vec<DiscoveryPattern>,
    signal_history: Vec<CoherenceSignal>,
}

impl DiscoveryEngine {
    /// Create a new discovery engine
    pub fn new(config: DiscoveryConfig) -> Self {
        Self {
            config,
            patterns: Vec::new(),
            signal_history: Vec::new(),
        }
    }

    /// Detect patterns from coherence signals
    pub fn detect(&mut self, signals: &[CoherenceSignal]) -> Result<Vec<DiscoveryPattern>> {
        self.signal_history.extend(signals.iter().cloned());

        let mut patterns = Vec::new();

        // Need at least 2 signals to detect patterns
        if self.signal_history.len() < 2 {
            return Ok(patterns);
        }

        // Detect different pattern types
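        // Each detector scans the full accumulated signal history, so patterns
        // found in earlier windows may be reported again on later calls.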
        patterns.extend(self.detect_emergence()?);
        patterns.extend(self.detect_splits()?);
        patterns.extend(self.detect_bridges()?);
        patterns.extend(self.detect_trends()?);

        if self.config.detect_anomalies {
            patterns.extend(self.detect_anomalies()?);
        }

        self.patterns.extend(patterns.clone());
        Ok(patterns)
    }

    /// Detect emerging structures
    fn detect_emergence(&self) -> Result<Vec<DiscoveryPattern>> {
        let mut patterns = Vec::new();

        if self.signal_history.len() < self.config.lookback_windows {
            return Ok(patterns);
        }

        let recent = &self.signal_history[self.signal_history.len() - self.config.lookback_windows..];

        // Look for sustained growth in node count across the lookback window
        let node_growth: Vec<i64> = recent
            .windows(2)
            .map(|w| w[1].node_count as i64 - w[0].node_count as i64)
            .collect();

        let total_growth: i64 = node_growth.iter().sum();
        let avg_growth = total_growth as f64 / node_growth.len() as f64;

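        // Growth counts as emergence when the average per-window increase
        // exceeds `emergence_threshold` as a fraction of the window's
        // starting node count.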
        if avg_growth > self.config.emergence_threshold * recent[0].node_count as f64 {
            let latest = recent.last().unwrap();

            patterns.push(DiscoveryPattern {
                id: format!("emergence_{}", self.patterns.len()),
                category: PatternCategory::Emergence,
                strength: PatternStrength::from_score(avg_growth / 10.0),
                confidence: (avg_growth / 10.0).min(1.0),
                detected_at: Utc::now(),
                time_range: Some((recent[0].window.start, latest.window.end)),
                entities: latest.cut_nodes.clone(),
                description: format!(
                    "Emerging structure detected: {} new nodes over {} windows",
                    total_growth,
                    recent.len()
                ),
                evidence: vec![PatternEvidence {
                    evidence_type: "node_growth".to_string(),
                    value: avg_growth,
                    source_ref: latest.id.clone(),
                    explanation: "Sustained node count growth".to_string(),
                }],
                metadata: HashMap::new(),
            });
        }

        Ok(patterns)
    }

    /// Detect structure splits
    fn detect_splits(&self) -> Result<Vec<DiscoveryPattern>> {
        let mut patterns = Vec::new();

        if self.signal_history.len() < 2 {
            return Ok(patterns);
        }

        // Look for sudden drops in min-cut value
        for i in 1..self.signal_history.len() {
            let prev = &self.signal_history[i - 1];
            let curr = &self.signal_history[i];

            if prev.min_cut_value > 0.0 {
                let drop_ratio = (prev.min_cut_value - curr.min_cut_value) / prev.min_cut_value;

                if drop_ratio > self.config.split_threshold {
                    patterns.push(DiscoveryPattern {
                        // Count both stored and in-progress patterns so that
                        // multiple splits found in one pass get unique ids.
                        id: format!("split_{}", self.patterns.len() + patterns.len()),
                        category: PatternCategory::Split,
                        strength: PatternStrength::from_score(drop_ratio),
                        confidence: drop_ratio.min(1.0),
                        detected_at: curr.window.start,
                        time_range: Some((prev.window.start, curr.window.end)),
                        entities: curr.cut_nodes.clone(),
                        description: format!(
                            "Structure split detected: {:.1}% coherence drop",
                            drop_ratio * 100.0
                        ),
                        evidence: vec![PatternEvidence {
                            evidence_type: "mincut_drop".to_string(),
                            value: drop_ratio,
                            source_ref: curr.id.clone(),
                            explanation: format!(
                                "Min-cut dropped from {:.3} to {:.3}",
                                prev.min_cut_value, curr.min_cut_value
                            ),
                        }],
                        metadata: HashMap::new(),
                    });
                }
            }
        }

        Ok(patterns)
    }

    /// Detect cross-domain bridges
    fn detect_bridges(&self) -> Result<Vec<DiscoveryPattern>> {
        let mut patterns = Vec::new();

        if self.signal_history.is_empty() {
            return Ok(patterns);
        }

        // Look for nodes that appear in cut boundaries frequently
        let mut boundary_counts: HashMap<String, usize> = HashMap::new();

        for signal in &self.signal_history {
            for node in &signal.cut_nodes {
                *boundary_counts.entry(node.clone()).or_default() += 1;
            }
        }

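        // A node qualifies as a bridge candidate when it appears on the cut
        // boundary in at least `bridge_threshold` of the observed windows.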
        let threshold =
            ((self.signal_history.len() as f64 * self.config.bridge_threshold) as usize).max(1);

        let bridge_nodes: Vec<_> = boundary_counts
            .iter()
            .filter(|(_, &count)| count >= threshold)
            .map(|(node, &count)| (node.clone(), count))
            .collect();

        if !bridge_nodes.is_empty() {
            let latest = self.signal_history.last().unwrap();

            patterns.push(DiscoveryPattern {
                id: format!("bridge_{}", self.patterns.len()),
                category: PatternCategory::Bridge,
                strength: PatternStrength::Moderate,
                confidence: 0.6,
                detected_at: Utc::now(),
                time_range: Some((
                    self.signal_history[0].window.start,
                    latest.window.end,
                )),
                entities: bridge_nodes.iter().map(|(n, _)| n.clone()).collect(),
                description: format!(
                    "Bridge nodes detected: {} nodes consistently on boundaries",
                    bridge_nodes.len()
                ),
                evidence: bridge_nodes
                    .iter()
                    .map(|(node, count)| PatternEvidence {
                        evidence_type: "boundary_frequency".to_string(),
                        value: *count as f64,
                        source_ref: node.clone(),
                        explanation: format!("{} appeared in {} cut boundaries", node, count),
                    })
                    .collect(),
                metadata: HashMap::new(),
            });
        }

        Ok(patterns)
    }

    /// Detect trends (consolidation/dissolution)
    fn detect_trends(&self) -> Result<Vec<DiscoveryPattern>> {
        let mut patterns = Vec::new();

        if self.signal_history.len() < self.config.lookback_windows {
            return Ok(patterns);
        }

        let recent = &self.signal_history[self.signal_history.len() - self.config.lookback_windows..];

        // Calculate trend in min-cut values
        let values: Vec<f64> = recent.iter().map(|s| s.min_cut_value).collect();

        let (slope, _) = self.linear_regression(&values);

        if slope.abs() > 0.1 {
            let latest = recent.last().unwrap();
            let category = if slope > 0.0 {
                PatternCategory::Consolidation
            } else {
                PatternCategory::Dissolution
            };

            patterns.push(DiscoveryPattern {
                id: format!("trend_{}", self.patterns.len()),
                category,
                strength: PatternStrength::from_score(slope.abs()),
                confidence: slope.abs().min(1.0),
                detected_at: Utc::now(),
                time_range: Some((recent[0].window.start, latest.window.end)),
                entities: vec![],
                description: format!(
                    "{} trend detected: min-cut changing by {:.3} per window",
                    if slope > 0.0 {
                        "Strengthening"
                    } else {
                        "Weakening"
                    },
                    slope
                ),
                evidence: vec![PatternEvidence {
                    evidence_type: "trend_slope".to_string(),
                    value: slope,
                    source_ref: latest.id.clone(),
                    explanation: format!(
                        "Linear trend slope: {:.4} over {} windows",
                        slope,
                        recent.len()
                    ),
                }],
                metadata: HashMap::new(),
            });
        }

        Ok(patterns)
    }

    /// Detect anomalies
    fn detect_anomalies(&self) -> Result<Vec<DiscoveryPattern>> {
        let mut patterns = Vec::new();

        if self.signal_history.len() < 5 {
            return Ok(patterns);
        }

        // Calculate mean and std dev of min-cut values
        let values: Vec<f64> = self.signal_history.iter().map(|s| s.min_cut_value).collect();

        let mean = values.iter().sum::<f64>() / values.len() as f64;
        let variance =
            values.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / values.len() as f64;
        let std_dev = variance.sqrt();

        // Find anomalies
        for (i, signal) in self.signal_history.iter().enumerate() {
            let z_score = if std_dev > 0.0 {
                (signal.min_cut_value - mean) / std_dev
            } else {
                0.0
            };

            if z_score.abs() > self.config.anomaly_sigma {
                patterns.push(DiscoveryPattern {
                    id: format!("anomaly_{}", i),
                    category: PatternCategory::Anomaly,
                    strength: PatternStrength::from_score(z_score.abs() / 5.0),
                    confidence: (z_score.abs() / 5.0).min(1.0),
                    detected_at: signal.window.start,
                    time_range: Some((signal.window.start, signal.window.end)),
                    entities: signal.cut_nodes.clone(),
                    description: format!(
                        "Anomalous coherence: {:.2}σ from mean",
                        z_score
                    ),
                    evidence: vec![PatternEvidence {
                        evidence_type: "z_score".to_string(),
                        value: z_score,
                        source_ref: signal.id.clone(),
                        explanation: format!(
                            "Value {:.4} vs mean {:.4} (σ={:.4})",
                            signal.min_cut_value, mean, std_dev
                        ),
                    }],
                    metadata: HashMap::new(),
                });
            }
        }

        Ok(patterns)
    }

    /// Least-squares linear regression over a series, with x taken as the
    /// sample index (0..n-1); returns (slope, intercept)
    fn linear_regression(&self, values: &[f64]) -> (f64, f64) {
        let n = values.len() as f64;
        let x_mean = (n - 1.0) / 2.0;
        let y_mean = values.iter().sum::<f64>() / n;

        let mut num = 0.0;
        let mut denom = 0.0;

        for (i, &y) in values.iter().enumerate() {
            let x = i as f64;
            num += (x - x_mean) * (y - y_mean);
            denom += (x - x_mean).powi(2);
        }

        let slope = if denom > 0.0 { num / denom } else { 0.0 };
        let intercept = y_mean - slope * x_mean;

        (slope, intercept)
    }

    /// Get all discovered patterns
    pub fn patterns(&self) -> &[DiscoveryPattern] {
        &self.patterns
    }

    /// Get patterns by category
    pub fn patterns_by_category(&self, category: PatternCategory) -> Vec<&DiscoveryPattern> {
        self.patterns
            .iter()
            .filter(|p| p.category == category)
            .collect()
    }

    /// Clear all discovered patterns and the accumulated signal history
    pub fn clear(&mut self) {
        self.patterns.clear();
        self.signal_history.clear();
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::TemporalWindow;

    fn make_signal(id: &str, min_cut: f64, nodes: usize) -> CoherenceSignal {
        CoherenceSignal {
            id: id.to_string(),
            window: TemporalWindow::new(Utc::now(), Utc::now(), 0),
            min_cut_value: min_cut,
            node_count: nodes,
            edge_count: nodes * 2,
            partition_sizes: Some((nodes / 2, nodes - nodes / 2)),
            is_exact: true,
            cut_nodes: vec![],
            delta: None,
        }
    }

    #[test]
    fn test_discovery_engine_creation() {
        let config = DiscoveryConfig::default();
        let engine = DiscoveryEngine::new(config);
        assert!(engine.patterns().is_empty());
    }
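
    // Illustrative test added as a usage sketch (not part of the original
    // suite): a node sitting on every cut boundary should be reported as a
    // bridge under the default bridge_threshold.
    #[test]
    fn test_bridge_detection() {
        let config = DiscoveryConfig::default();
        let mut engine = DiscoveryEngine::new(config);

        let signals: Vec<CoherenceSignal> = (0..3)
            .map(|i| {
                let mut s = make_signal(&format!("s{}", i), 1.0, 10);
                s.cut_nodes = vec!["hub".to_string()];
                s
            })
            .collect();

        let patterns = engine.detect(&signals).unwrap();
        assert!(patterns
            .iter()
            .any(|p| p.category == PatternCategory::Bridge
                && p.entities.contains(&"hub".to_string())));
    }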

    #[test]
    fn test_pattern_strength() {
        assert_eq!(PatternStrength::from_score(0.1), PatternStrength::Weak);
        assert_eq!(PatternStrength::from_score(0.3), PatternStrength::Moderate);
        assert_eq!(PatternStrength::from_score(0.6), PatternStrength::Strong);
        assert_eq!(
            PatternStrength::from_score(0.9),
            PatternStrength::VeryStrong
        );
    }
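
    // Illustrative test added as a usage sketch (not part of the original
    // suite): sustained node-count growth across the default lookback of 10
    // windows should surface an Emergence pattern.
    #[test]
    fn test_emergence_detection() {
        let config = DiscoveryConfig::default();
        let mut engine = DiscoveryEngine::new(config);

        // Node count grows by 5 per window while coherence stays flat.
        let signals: Vec<CoherenceSignal> = (0..10)
            .map(|i| make_signal(&format!("s{}", i), 1.0, 10 + i * 5))
            .collect();

        let patterns = engine.detect(&signals).unwrap();
        assert!(patterns
            .iter()
            .any(|p| p.category == PatternCategory::Emergence));
    }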

    #[test]
    fn test_empty_signals() {
        let config = DiscoveryConfig::default();
        let mut engine = DiscoveryEngine::new(config);

        let patterns = engine.detect(&[]).unwrap();
        assert!(patterns.is_empty());
    }
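
    // Illustrative test added as a usage sketch (not part of the original
    // suite): a min-cut drop beyond the default split_threshold of 0.5
    // between consecutive windows should surface a Split pattern.
    #[test]
    fn test_split_detection() {
        let config = DiscoveryConfig::default();
        let mut engine = DiscoveryEngine::new(config);

        let signals = vec![
            make_signal("before", 10.0, 20),
            make_signal("after", 2.0, 20),
        ];

        let patterns = engine.detect(&signals).unwrap();
        assert!(patterns
            .iter()
            .any(|p| p.category == PatternCategory::Split));
    }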

    #[test]
    fn test_linear_regression() {
        let config = DiscoveryConfig::default();
        let engine = DiscoveryEngine::new(config);

        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let (slope, intercept) = engine.linear_regression(&values);

        assert!((slope - 1.0).abs() < 0.001);
        assert!((intercept - 1.0).abs() < 0.001);
    }
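
    // Illustrative test added as a usage sketch (not part of the original
    // suite): with the default anomaly_sigma of 2.5, a single large outlier
    // among otherwise constant min-cut values should be flagged as an
    // anomaly (a Consolidation trend may also be reported).
    #[test]
    fn test_anomaly_detection() {
        let config = DiscoveryConfig::default();
        let mut engine = DiscoveryEngine::new(config);

        // Nine stable windows followed by one outlier; the outlier's z-score
        // is 3.0 against the population standard deviation.
        let mut signals: Vec<CoherenceSignal> = (0..9)
            .map(|i| make_signal(&format!("s{}", i), 1.0, 10))
            .collect();
        signals.push(make_signal("outlier", 11.0, 10));

        let patterns = engine.detect(&signals).unwrap();
        assert!(patterns
            .iter()
            .any(|p| p.category == PatternCategory::Anomaly));
    }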
}