//! Density-adaptive DBSCAN clustering with streaming support.
//! Module path: do_memory_mcp/patterns/predictive/dbscan.rs
use anyhow::Result;
use serde::Serialize;

use super::kdtree::{KDTree, Point};
/// A cluster of points discovered by DBSCAN.
#[derive(Debug, Clone, Serialize)]
pub struct Cluster {
    /// Cluster identifier (equals its index in the cluster list at creation time).
    pub id: usize,
    /// Member points assigned to this cluster.
    pub points: Vec<Point>,
    /// Per-feature mean of the member points' feature vectors.
    pub centroid: Vec<f64>,
    /// Local density recorded when the cluster was created.
    pub density: f64,
}
13
/// Cluster label for DBSCAN results.
///
/// `Eq` is derived alongside `PartialEq` (the variants carry only `usize`,
/// so equality is total), which lets labels be used in hash-based
/// collections and exact comparisons.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterLabel {
    /// Noise point (anomaly)
    Noise,
    /// Cluster ID
    Cluster(usize),
}
22
/// Density-adaptive DBSCAN configuration.
#[derive(Debug, Clone)]
pub struct DBSCANConfig {
    /// Density threshold below which a point is labeled noise
    /// (replaces the classic eps/MinPts pair for streaming updates).
    pub density: f64,
    /// Minimum cluster size for validation.
    pub min_cluster_size: usize,
    /// Maximum distance for two points to count as neighbors.
    pub max_distance: f64,
    /// Sliding-window size for streaming data (oldest points are evicted first).
    pub window_size: usize,
}
35
36impl Default for DBSCANConfig {
37    fn default() -> Self {
38        Self {
39            density: 0.1,
40            min_cluster_size: 3,
41            max_distance: 1.0,
42            window_size: 1000,
43        }
44    }
45}
46
/// Streaming cluster state for incremental updates.
#[derive(Debug)]
pub struct StreamingClusters {
    /// Clusters discovered so far (a cluster's index matches its `id`).
    pub clusters: Vec<Cluster>,
    /// Spatial index over the current window; rebuilt on every update.
    pub kd_tree: KDTree,
    /// Sliding window of recent points, bounded by `config.window_size`.
    pub window: Vec<Point>,
    /// Tuning parameters for density, distance, and windowing.
    pub config: DBSCANConfig,
}
55
56impl StreamingClusters {
57    pub fn new(config: DBSCANConfig) -> Self {
58        Self {
59            clusters: Vec::new(),
60            kd_tree: KDTree::new(),
61            window: Vec::with_capacity(config.window_size),
62            config,
63        }
64    }
65
66    /// Update clusters with new point
67    pub fn update(&mut self, new_point: Point) -> ClusterLabel {
68        // Add to window and maintain size
69        self.window.push(new_point.clone());
70        if self.window.len() > self.config.window_size {
71            self.window.remove(0);
72        }
73
74        // Rebuild KD-tree with current window
75        self.kd_tree = KDTree::build(&self.window);
76
77        // Calculate local density for the new point
78        let local_density = self.calculate_local_density(&new_point);
79
80        // Check if point is noise or part of cluster
81        if local_density < self.config.density {
82            ClusterLabel::Noise
83        } else {
84            // Find or assign to cluster
85            self.assign_to_cluster(new_point, local_density)
86        }
87    }
88
89    /// Calculate local density around a point
90    fn calculate_local_density(&self, point: &Point) -> f64 {
91        let neighbors = self
92            .kd_tree
93            .find_neighbors(&point.features, self.config.max_distance);
94
95        // Exclude the point itself (distance 0) from density calculation.
96        let filtered: Vec<Point> = neighbors.into_iter().filter(|p| p.id != point.id).collect();
97
98        // Calculate density based on neighbor count and distances
99        if filtered.is_empty() {
100            0.0
101        } else {
102            let avg_distance: f64 = filtered
103                .iter()
104                .map(|neighbor| calculate_distance(&point.features, &neighbor.features))
105                .sum::<f64>()
106                / filtered.len() as f64;
107
108            // Higher density = more neighbors + closer proximity
109            filtered.len() as f64 / (1.0 + avg_distance)
110        }
111    }
112
113    /// Assign point to existing cluster or create new one
114    fn assign_to_cluster(&mut self, point: Point, density: f64) -> ClusterLabel {
115        // Find nearest cluster centroid
116        let mut nearest_cluster = None;
117        let mut min_distance = f64::INFINITY;
118
119        for (i, cluster) in self.clusters.iter().enumerate() {
120            let distance = calculate_distance(&point.features, &cluster.centroid);
121            if distance < min_distance && distance <= self.config.max_distance {
122                min_distance = distance;
123                nearest_cluster = Some(i);
124            }
125        }
126
127        if let Some(cluster_idx) = nearest_cluster {
128            // Add to existing cluster
129            self.clusters[cluster_idx].points.push(point.clone());
130            self.update_cluster_centroid(cluster_idx);
131            ClusterLabel::Cluster(cluster_idx)
132        } else {
133            // Create new cluster
134            let new_cluster = Cluster {
135                id: self.clusters.len(),
136                points: vec![point.clone()],
137                centroid: point.features.clone(),
138                density,
139            };
140            self.clusters.push(new_cluster);
141            ClusterLabel::Cluster(self.clusters.len() - 1)
142        }
143    }
144
145    /// Update cluster centroid after adding points
146    fn update_cluster_centroid(&mut self, cluster_idx: usize) {
147        if self.clusters[cluster_idx].points.is_empty() {
148            return;
149        }
150
151        let cluster = &self.clusters[cluster_idx];
152        let feature_count = cluster.points[0].features.len();
153        let mut new_centroid = vec![0.0; feature_count];
154
155        for point in &cluster.points {
156            for (i, &feature) in point.features.iter().enumerate() {
157                new_centroid[i] += feature;
158            }
159        }
160
161        for centroid_val in &mut new_centroid {
162            *centroid_val /= cluster.points.len() as f64;
163        }
164
165        self.clusters[cluster_idx].centroid = new_centroid;
166    }
167
168    /// Calculate local density maps for the entire dataset
169    pub fn calculate_density_maps(&self) -> Vec<f64> {
170        let mut density_map = Vec::with_capacity(self.window.len());
171
172        for point in &self.window {
173            let density = self.calculate_local_density(point);
174            density_map.push(density);
175        }
176
177        density_map
178    }
179}
180
/// Adaptive DBSCAN anomaly detector.
///
/// Wraps batch DBSCAN with data-driven parameter selection, and carries a
/// streaming cluster state for incremental updates.
#[derive(Debug)]
#[allow(dead_code)]
pub struct AdaptiveDBSCAN {
    // Tuning parameters; also cloned into `streaming_clusters` at construction.
    config: DBSCANConfig,
    // Incremental clustering state backing the streaming API.
    streaming_clusters: StreamingClusters,
}
188
189impl AdaptiveDBSCAN {
190    pub fn new(config: DBSCANConfig) -> Result<Self> {
191        Ok(Self {
192            streaming_clusters: StreamingClusters::new(config.clone()),
193            config,
194        })
195    }
196
197    /// Main DBSCAN anomaly detection function
198    pub fn detect_anomalies_dbscan(
199        &mut self,
200        values: &[f64],
201        timestamps: &[f64],
202    ) -> Vec<ClusterLabel> {
203        // Build a point set and run (adaptive) DBSCAN in batch for deterministic results.
204        // This avoids streaming-order artifacts and improves anomaly detection for small series.
205        let points: Vec<Point> = values
206            .iter()
207            .enumerate()
208            .map(|(i, &value)| {
209                let timestamp = timestamps.get(i).copied().unwrap_or(i as f64);
210                Point {
211                    id: i,
212                    values: vec![value],
213                    embedding: None,
214                    timestamp,
215                    features: vec![value],
216                }
217            })
218            .collect();
219
220        self.adaptive_dbscan_clustering(&points)
221    }
222
223    /// Adaptive DBSCAN clustering with density-based parameter optimization
224    pub fn adaptive_dbscan_clustering(&mut self, points: &[Point]) -> Vec<ClusterLabel> {
225        // Calculate adaptive parameters based on data distribution
226        let adaptive_params = self.calculate_adaptive_parameters(points);
227
228        // Apply DBSCAN with adaptive parameters
229        self.apply_dbscan(points, adaptive_params)
230    }
231
232    /// Calculate adaptive parameters using metaheuristic optimization
233    fn calculate_adaptive_parameters(&self, points: &[Point]) -> (f64, usize) {
234        if points.len() < 3 {
235            return (0.5, 2);
236        }
237
238        // Extract features for analysis
239        let features: Vec<Vec<f64>> = points.iter().map(|p| p.features.clone()).collect();
240
241        // Calculate feature statistics
242        let mut all_values = Vec::new();
243        for feature_vec in &features {
244            all_values.extend(feature_vec);
245        }
246
247        let mean: f64 = all_values.iter().sum::<f64>() / all_values.len() as f64;
248        let variance: f64 =
249            all_values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / all_values.len() as f64;
250        let std_dev = variance.sqrt();
251
252        // Adaptive epsilon based on data distribution
253        // Ensure it's strictly positive so callers can rely on `epsilon > 0.0`.
254        let adaptive_epsilon = (std_dev * 2.0).max(1e-6); // 2 std dev, floored
255        let adaptive_min_samples = (points.len() as f64 * 0.1).max(3.0) as usize; // 10% of data or minimum 3
256
257        (
258            adaptive_epsilon,
259            adaptive_min_samples.min(points.len().saturating_sub(1)),
260        )
261    }
262
263    /// Apply DBSCAN algorithm with given parameters
264    fn apply_dbscan(&self, points: &[Point], params: (f64, usize)) -> Vec<ClusterLabel> {
265        let (epsilon, min_samples) = params;
266
267        if points.is_empty() {
268            return Vec::new();
269        }
270
271        // Build KD-tree for efficient neighbor queries
272        let kd_tree = KDTree::build(points);
273
274        let mut labels = vec![ClusterLabel::Noise; points.len()];
275        let mut cluster_id = 0;
276
277        for (i, point) in points.iter().enumerate() {
278            if !matches!(labels[i], ClusterLabel::Noise) {
279                continue; // Already processed
280            }
281
282            // Find neighbors
283            let neighbors = kd_tree.find_neighbors(&point.features, epsilon);
284
285            if neighbors.len() < min_samples {
286                labels[i] = ClusterLabel::Noise; // Mark as noise
287            } else {
288                // Start new cluster
289                labels[i] = ClusterLabel::Cluster(cluster_id);
290                let mut cluster_points = vec![i];
291
292                // Expand cluster
293                let mut queue = neighbors.iter().map(|n| n.id).collect::<Vec<_>>();
294
295                while let Some(neighbor_id) = queue.pop() {
296                    if matches!(labels[neighbor_id], ClusterLabel::Noise) {
297                        labels[neighbor_id] = ClusterLabel::Cluster(cluster_id);
298                        cluster_points.push(neighbor_id);
299
300                        // Add neighbors of this point to queue
301                        let neighbor_neighbors =
302                            kd_tree.find_neighbors(&points[neighbor_id].features, epsilon);
303
304                        for neighbor_neighbor in &neighbor_neighbors {
305                            if matches!(labels[neighbor_neighbor.id], ClusterLabel::Noise) {
306                                queue.push(neighbor_neighbor.id);
307                            }
308                        }
309                    }
310                }
311
312                cluster_id += 1;
313            }
314        }
315
316        labels
317    }
318
319    /// Update streaming clusters incrementally
320    pub fn update_streaming_clusters(&mut self, new_point: Point) -> ClusterLabel {
321        self.streaming_clusters.update(new_point)
322    }
323
324    /// Get current density maps
325    pub fn get_density_maps(&self) -> Vec<f64> {
326        self.streaming_clusters.calculate_density_maps()
327    }
328}
329
/// Calculate Euclidean distance between two feature vectors.
///
/// Vectors of unequal length are compared over their common prefix; when
/// there is no overlapping dimension the distance is 0.0 (same contract as
/// before — `zip` truncates to the shorter input).
fn calculate_distance(a: &[f64], b: &[f64]) -> f64 {
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y) * (x - y))
        .sum::<f64>()
        .sqrt()
}