Skip to main content

do_memory_mcp/patterns/predictive/
extraction.rs

1//! # Pattern Extraction Module
2//!
3//! This module extracts patterns from DBSCAN clusters and summarizes their characteristics.
4
5use anyhow::Result;
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8
9use super::dbscan::{Cluster, ClusterLabel};
10
11/// Extracted pattern from a cluster
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct ExtractedPattern {
14    /// Pattern ID
15    pub id: String,
16    /// Cluster ID this pattern was extracted from
17    pub cluster_id: usize,
18    /// Pattern description
19    pub description: String,
20    /// Cluster characteristics
21    pub characteristics: ClusterCharacteristics,
22    /// Pattern quality score (0-1)
23    pub quality_score: f64,
24    /// Pattern type
25    pub pattern_type: PatternType,
26    /// Variable names involved in this pattern
27    pub variables: Vec<String>,
28    /// Temporal range (start, end indices)
29    pub temporal_range: (usize, usize),
30}
31
32/// Cluster characteristics summary
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct ClusterCharacteristics {
35    /// Number of points in cluster
36    pub size: usize,
37    /// Cluster centroid
38    pub centroid: Vec<f64>,
39    /// Cluster density
40    pub density: f64,
41    /// Cluster variance
42    pub variance: f64,
43    /// Cluster compactness (inverse of spread)
44    pub compactness: f64,
45    /// Time span of cluster
46    pub time_span: f64,
47}
48
49/// Pattern type classification
50#[derive(Debug, Clone, Serialize, Deserialize)]
51pub enum PatternType {
52    /// Temporal pattern (trend, seasonality)
53    Temporal { pattern: String },
54    /// Anomaly pattern (outliers)
55    Anomaly { severity: String },
56    /// Stable pattern (consistent behavior)
57    Stable { consistency: f64 },
58    /// Transition pattern (change between states)
59    Transition { from: String, to: String },
60    /// Unknown pattern type
61    Unknown,
62}
63
/// Settings that control pattern extraction.
#[derive(Debug, Clone)]
pub struct ExtractionConfig {
    /// Lowest quality score a cluster may have and still yield a pattern.
    pub min_quality: f64,
    /// Smallest number of points a cluster must contain to be considered.
    pub min_cluster_size: usize,
    /// When `true`, detailed descriptions are generated instead of simple ones.
    pub verbose: bool,
}

impl Default for ExtractionConfig {
    /// Defaults: quality threshold `0.6`, at least `3` points per cluster,
    /// detailed descriptions enabled.
    fn default() -> Self {
        ExtractionConfig {
            verbose: true,
            min_cluster_size: 3,
            min_quality: 0.6,
        }
    }
}
84
85/// Pattern extraction engine
86pub struct PatternExtractor {
87    config: ExtractionConfig,
88}
89
90impl PatternExtractor {
91    /// Create a new pattern extractor
92    pub fn new(config: ExtractionConfig) -> Self {
93        Self { config }
94    }
95
96    /// Create with default configuration
97    pub fn default_config() -> Self {
98        Self::new(ExtractionConfig::default())
99    }
100
101    /// Extract patterns from DBSCAN clusters
102    pub fn extract_patterns(
103        &self,
104        clusters: &[Cluster],
105        labels: &[ClusterLabel],
106        variable_names: &[String],
107    ) -> Result<Vec<ExtractedPattern>> {
108        let mut patterns = Vec::new();
109
110        for (cluster_idx, cluster) in clusters.iter().enumerate() {
111            // Check if cluster meets quality criteria
112            if cluster.points.len() < self.config.min_cluster_size {
113                continue;
114            }
115
116            let characteristics = self.compute_cluster_characteristics(cluster)?;
117
118            let quality_score = self.compute_quality_score(cluster, &characteristics);
119
120            if quality_score < self.config.min_quality {
121                continue;
122            }
123
124            let pattern_type = self.classify_pattern_type(cluster, &characteristics);
125
126            let description = if self.config.verbose {
127                self.generate_detailed_description(cluster, &characteristics, &pattern_type)
128            } else {
129                self.generate_simple_description(cluster, &pattern_type)
130            };
131
132            let temporal_range = self.compute_temporal_range(cluster);
133
134            let pattern = ExtractedPattern {
135                id: format!("pattern_{}", cluster_idx),
136                cluster_id: cluster_idx,
137                description,
138                characteristics,
139                quality_score,
140                pattern_type,
141                variables: variable_names.to_vec(),
142                temporal_range,
143            };
144
145            patterns.push(pattern);
146        }
147
148        // Also extract noise patterns (anomalies)
149        let noise_patterns = self.extract_noise_patterns(labels, variable_names)?;
150        patterns.extend(noise_patterns);
151
152        Ok(patterns)
153    }
154
155    /// Compute cluster characteristics
156    fn compute_cluster_characteristics(&self, cluster: &Cluster) -> Result<ClusterCharacteristics> {
157        if cluster.points.is_empty() {
158            anyhow::bail!("Cannot compute characteristics for empty cluster");
159        }
160
161        // Size
162        let size = cluster.points.len();
163
164        // Centroid (already computed)
165        let centroid = cluster.centroid.clone();
166
167        // Density
168        let density = cluster.density;
169
170        // Variance (average squared distance from centroid)
171        let variance = if size > 1 {
172            cluster
173                .points
174                .iter()
175                .map(|p| {
176                    p.features
177                        .iter()
178                        .zip(&centroid)
179                        .map(|(&x, &c)| (x - c).powi(2))
180                        .sum::<f64>()
181                })
182                .sum::<f64>()
183                / size as f64
184        } else {
185            0.0
186        };
187
188        // Compactness (inverse of variance, normalized)
189        let compactness = if variance > 0.0 {
190            1.0 / (1.0 + variance)
191        } else {
192            1.0
193        };
194
195        // Time span
196        let time_span = if size > 1 {
197            let min_time = cluster
198                .points
199                .iter()
200                .map(|p| p.timestamp)
201                .fold(f64::INFINITY, f64::min);
202            let max_time = cluster
203                .points
204                .iter()
205                .map(|p| p.timestamp)
206                .fold(f64::NEG_INFINITY, f64::max);
207            max_time - min_time
208        } else {
209            0.0
210        };
211
212        Ok(ClusterCharacteristics {
213            size,
214            centroid,
215            density,
216            variance,
217            compactness,
218            time_span,
219        })
220    }
221
222    /// Compute quality score for a cluster
223    fn compute_quality_score(
224        &self,
225        cluster: &Cluster,
226        characteristics: &ClusterCharacteristics,
227    ) -> f64 {
228        // Quality factors:
229        // 1. Size (more points = better, up to a threshold)
230        let size_score = (cluster.points.len() as f64).ln() / 10.0;
231        let size_score = size_score.clamp(0.0, 1.0);
232
233        // 2. Density (higher density = better)
234        let density_score = characteristics.density.clamp(0.0, 1.0);
235
236        // 3. Compactness (higher compactness = better)
237        let compactness_score = characteristics.compactness;
238
239        // 4. Stability (lower variance = better)
240        let stability_score = 1.0 / (1.0 + characteristics.variance);
241
242        // Weighted combination
243        0.3 * size_score + 0.3 * density_score + 0.2 * compactness_score + 0.2 * stability_score
244    }
245
246    /// Classify pattern type
247    fn classify_pattern_type(
248        &self,
249        cluster: &Cluster,
250        characteristics: &ClusterCharacteristics,
251    ) -> PatternType {
252        // Check for temporal pattern
253        if characteristics.time_span > cluster.points.len() as f64 * 0.5 {
254            // Cluster spans significant time
255
256            // Check for trend
257            let first_point = &cluster.points[0];
258            let last_point = &cluster.points[cluster.points.len() - 1];
259
260            if !first_point.features.is_empty() && !last_point.features.is_empty() {
261                let trend = last_point.features[0] - first_point.features[0];
262
263                if trend.abs() > 0.1 {
264                    return PatternType::Temporal {
265                        pattern: if trend > 0.0 {
266                            "increasing_trend".to_string()
267                        } else {
268                            "decreasing_trend".to_string()
269                        },
270                    };
271                }
272            }
273
274            return PatternType::Temporal {
275                pattern: "temporal_pattern".to_string(),
276            };
277        }
278
279        // Check for stable pattern
280        if characteristics.compactness > 0.8 && characteristics.variance < 0.5 {
281            return PatternType::Stable {
282                consistency: characteristics.compactness,
283            };
284        }
285
286        // Default to unknown
287        PatternType::Unknown
288    }
289
290    /// Generate detailed description
291    fn generate_detailed_description(
292        &self,
293        cluster: &Cluster,
294        characteristics: &ClusterCharacteristics,
295        pattern_type: &PatternType,
296    ) -> String {
297        let mut desc = String::new();
298
299        desc.push_str(&format!(
300            "Cluster {} contains {} points with density {:.2}. ",
301            cluster.id, characteristics.size, characteristics.density
302        ));
303
304        desc.push_str(&format!(
305            "Centroid: {:?}, Variance: {:.2}, Compactness: {:.2}. ",
306            characteristics.centroid, characteristics.variance, characteristics.compactness
307        ));
308
309        match pattern_type {
310            PatternType::Temporal { pattern } => {
311                desc.push_str(&format!("Pattern type: Temporal ({})", pattern));
312            }
313            PatternType::Anomaly { severity } => {
314                desc.push_str(&format!("Pattern type: Anomaly (severity: {})", severity));
315            }
316            PatternType::Stable { consistency } => {
317                desc.push_str(&format!(
318                    "Pattern type: Stable (consistency: {:.2})",
319                    consistency
320                ));
321            }
322            PatternType::Transition { from, to } => {
323                desc.push_str(&format!("Pattern type: Transition ({} -> {})", from, to));
324            }
325            PatternType::Unknown => {
326                desc.push_str("Pattern type: Unknown");
327            }
328        }
329
330        desc
331    }
332
333    /// Generate simple description
334    fn generate_simple_description(&self, cluster: &Cluster, pattern_type: &PatternType) -> String {
335        match pattern_type {
336            PatternType::Temporal { pattern } => {
337                format!(
338                    "Temporal pattern: {} ({} points)",
339                    pattern,
340                    cluster.points.len()
341                )
342            }
343            PatternType::Anomaly { severity } => {
344                format!(
345                    "Anomaly detected: {} ({} points)",
346                    severity,
347                    cluster.points.len()
348                )
349            }
350            PatternType::Stable { consistency } => {
351                format!(
352                    "Stable pattern: {:.2}% consistency ({} points)",
353                    consistency * 100.0,
354                    cluster.points.len()
355                )
356            }
357            PatternType::Transition { from, to } => {
358                format!(
359                    "Transition: {} -> {} ({} points)",
360                    from,
361                    to,
362                    cluster.points.len()
363                )
364            }
365            PatternType::Unknown => {
366                format!(
367                    "Cluster {} with {} points",
368                    cluster.id,
369                    cluster.points.len()
370                )
371            }
372        }
373    }
374
375    /// Compute temporal range of a cluster
376    fn compute_temporal_range(&self, cluster: &Cluster) -> (usize, usize) {
377        if cluster.points.is_empty() {
378            return (0, 0);
379        }
380
381        let min_idx = cluster.points.iter().map(|p| p.id).min().unwrap_or(0);
382        let max_idx = cluster.points.iter().map(|p| p.id).max().unwrap_or(0);
383
384        (min_idx, max_idx)
385    }
386
387    /// Extract noise patterns (anomalies)
388    fn extract_noise_patterns(
389        &self,
390        labels: &[ClusterLabel],
391        variable_names: &[String],
392    ) -> Result<Vec<ExtractedPattern>> {
393        let mut noise_indices = Vec::new();
394
395        for (i, label) in labels.iter().enumerate() {
396            if matches!(label, ClusterLabel::Noise) {
397                noise_indices.push(i);
398            }
399        }
400
401        if noise_indices.is_empty() {
402            return Ok(Vec::new());
403        }
404
405        // Create anomaly pattern
406        let pattern = ExtractedPattern {
407            id: "anomaly_pattern".to_string(),
408            cluster_id: usize::MAX,
409            description: format!(
410                "Detected {} anomaly points across {} variables",
411                noise_indices.len(),
412                variable_names.len()
413            ),
414            characteristics: ClusterCharacteristics {
415                size: noise_indices.len(),
416                centroid: vec![0.0], // No meaningful centroid for noise
417                density: 0.0,
418                variance: f64::INFINITY,
419                compactness: 0.0,
420                time_span: 0.0,
421            },
422            quality_score: 0.5,
423            pattern_type: PatternType::Anomaly {
424                severity: if noise_indices.len() > 5 {
425                    "high".to_string()
426                } else if noise_indices.len() > 2 {
427                    "medium".to_string()
428                } else {
429                    "low".to_string()
430                },
431            },
432            variables: variable_names.to_vec(),
433            temporal_range: (
434                *noise_indices.first().unwrap_or(&0),
435                *noise_indices.last().unwrap_or(&0),
436            ),
437        };
438
439        Ok(vec![pattern])
440    }
441
442    /// Filter patterns by quality threshold
443    pub fn filter_by_quality(&self, patterns: &[ExtractedPattern]) -> Vec<ExtractedPattern> {
444        patterns
445            .iter()
446            .filter(|p| p.quality_score >= self.config.min_quality)
447            .cloned()
448            .collect()
449    }
450
451    /// Get pattern statistics
452    pub fn get_pattern_stats(&self, patterns: &[ExtractedPattern]) -> HashMap<String, f64> {
453        let mut stats = HashMap::new();
454
455        stats.insert("total_patterns".to_string(), patterns.len() as f64);
456
457        let temporal_count = patterns
458            .iter()
459            .filter(|p| matches!(p.pattern_type, PatternType::Temporal { .. }))
460            .count();
461        stats.insert("temporal_patterns".to_string(), temporal_count as f64);
462
463        let anomaly_count = patterns
464            .iter()
465            .filter(|p| matches!(p.pattern_type, PatternType::Anomaly { .. }))
466            .count();
467        stats.insert("anomaly_patterns".to_string(), anomaly_count as f64);
468
469        let stable_count = patterns
470            .iter()
471            .filter(|p| matches!(p.pattern_type, PatternType::Stable { .. }))
472            .count();
473        stats.insert("stable_patterns".to_string(), stable_count as f64);
474
475        let avg_quality = if patterns.is_empty() {
476            0.0
477        } else {
478            patterns.iter().map(|p| p.quality_score).sum::<f64>() / patterns.len() as f64
479        };
480        stats.insert("average_quality".to_string(), avg_quality);
481
482        stats
483    }
484}
485
486#[cfg(test)]
487mod tests;