use std::collections::HashMap;

use ndarray::Array2;
use thiserror::Error;
use tracing::{debug, info, instrument, warn};

use crate::domain::entities::{
    Anomaly, AnomalyType, Cluster, ClusterId, EmbeddingId, Motif, MotifOccurrence, Prototype,
    RecordingId, SegmentId, SequenceAnalysis,
};
use crate::domain::value_objects::{
    ClusteringConfig, ClusteringMethod, ClusteringResult, DistanceMetric, MotifConfig,
    SequenceMetrics, TransitionMatrix,
};
use crate::infrastructure::{HdbscanClusterer, KMeansClusterer, MarkovAnalyzer};
use crate::metrics::SilhouetteScore;

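/// Errors produced by the clustering, motif, sequence, and anomaly services.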
#[derive(Debug, Error)]
pub enum AnalysisError {
    #[error("Insufficient data: {0}")]
    InsufficientData(String),

    #[error("Invalid configuration: {0}")]
    InvalidConfig(String),

    #[error("Clustering failed: {0}")]
    ClusteringFailed(String),

    #[error("Motif detection failed: {0}")]
    MotifDetectionFailed(String),

    #[error("Sequence analysis failed: {0}")]
    SequenceAnalysisFailed(String),

    #[error("Anomaly detection failed: {0}")]
    AnomalyDetectionFailed(String),

    #[error("Computation error: {0}")]
    ComputationError(String),

    #[error("Infrastructure error: {0}")]
    Infrastructure(String),
}

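/// Result alias used throughout the analysis services.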
pub type Result<T> = std::result::Result<T, AnalysisError>;

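/// An embedding vector paired with its identifier.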
pub type EmbeddingWithId = (EmbeddingId, Vec<f32>);

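/// Groups embeddings into clusters and derives cluster-level artifacts such
/// as prototypes and quality metrics.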
pub struct ClusteringService {
    config: ClusteringConfig,
}

impl ClusteringService {
    /// Creates a clustering service with the given configuration.
    #[must_use]
    pub fn new(config: ClusteringConfig) -> Self {
        Self { config }
    }

    /// Creates a clustering service with the default configuration.
    #[must_use]
    pub fn default_service() -> Self {
        Self::new(ClusteringConfig::default())
    }

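    /// Runs HDBSCAN over the given embeddings and returns the discovered clusters.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if the input is empty or
    /// smaller than the configured minimum cluster size, and
    /// [`AnalysisError::InvalidConfig`] if embedding dimensions are inconsistent.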
    #[instrument(skip(self, embeddings), fields(n_embeddings = embeddings.len()))]
    pub async fn run_hdbscan(
        &self,
        embeddings: &[EmbeddingWithId],
    ) -> Result<Vec<Cluster>> {
        if embeddings.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "Cannot cluster empty embedding set".to_string(),
            ));
        }

        if embeddings.len() < self.config.parameters.min_cluster_size {
            return Err(AnalysisError::InsufficientData(format!(
                "Need at least {} embeddings for HDBSCAN, got {}",
                self.config.parameters.min_cluster_size,
                embeddings.len()
            )));
        }

        info!(
            n_embeddings = embeddings.len(),
            min_cluster_size = self.config.parameters.min_cluster_size,
            min_samples = self.config.parameters.min_samples,
            "Starting HDBSCAN clustering"
        );

        let dim = embeddings[0].1.len();
        let n = embeddings.len();
        let mut matrix = Array2::<f32>::zeros((n, dim));

        for (i, (_, vec)) in embeddings.iter().enumerate() {
            if vec.len() != dim {
                return Err(AnalysisError::InvalidConfig(format!(
                    "Embedding dimension mismatch: expected {}, got {}",
                    dim,
                    vec.len()
                )));
            }
            for (j, &val) in vec.iter().enumerate() {
                matrix[[i, j]] = val;
            }
        }

        let clusterer = HdbscanClusterer::new(
            self.config.parameters.min_cluster_size,
            self.config.parameters.min_samples,
            self.config.parameters.metric,
        );

        let labels = clusterer.fit(&matrix)?;

        let clusters = self.labels_to_clusters(embeddings, &labels)?;

        info!(
            n_clusters = clusters.len(),
            "HDBSCAN clustering completed"
        );

        Ok(clusters)
    }

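    /// Runs K-means with `k` clusters over the given embeddings.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if the input is empty or
    /// contains fewer than `k` embeddings.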
    #[instrument(skip(self, embeddings), fields(n_embeddings = embeddings.len(), k = k))]
    pub async fn run_kmeans(
        &self,
        embeddings: &[EmbeddingWithId],
        k: usize,
    ) -> Result<Vec<Cluster>> {
        if embeddings.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "Cannot cluster empty embedding set".to_string(),
            ));
        }

        if embeddings.len() < k {
            return Err(AnalysisError::InsufficientData(format!(
                "Need at least {} embeddings for K-means with k={}, got {}",
                k,
                k,
                embeddings.len()
            )));
        }

        info!(n_embeddings = embeddings.len(), k = k, "Starting K-means clustering");

        let dim = embeddings[0].1.len();
        let n = embeddings.len();
        let mut matrix = Array2::<f32>::zeros((n, dim));

        for (i, (_, vec)) in embeddings.iter().enumerate() {
            for (j, &val) in vec.iter().enumerate() {
                matrix[[i, j]] = val;
            }
        }

        let clusterer = KMeansClusterer::new(k, self.config.random_seed);
        let (labels, centroids) = clusterer.fit(&matrix)?;

        let clusters = self.labels_to_clusters_with_centroids(
            embeddings,
            &labels,
            &centroids,
        )?;

        info!(n_clusters = clusters.len(), "K-means clustering completed");

        Ok(clusters)
    }

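    /// Assigns an embedding to the cluster with the nearest centroid.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if `clusters` is empty.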
    #[instrument(skip(self, embedding, clusters), fields(n_clusters = clusters.len()))]
    pub async fn assign_to_nearest(
        &self,
        embedding: &[f32],
        clusters: &[Cluster],
    ) -> Result<ClusterId> {
        if clusters.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "No clusters available for assignment".to_string(),
            ));
        }

        let mut min_distance = f32::MAX;
        let mut nearest_cluster = clusters[0].id;

        for cluster in clusters {
            let distance = self.compute_distance(embedding, &cluster.centroid);
            if distance < min_distance {
                min_distance = distance;
                nearest_cluster = cluster.id;
            }
        }

        debug!(
            cluster_id = %nearest_cluster,
            distance = min_distance,
            "Assigned embedding to nearest cluster"
        );

        Ok(nearest_cluster)
    }

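    /// Selects the most central members of a cluster as its prototypes.
    ///
    /// Members are scored by `1 / (1 + distance_to_centroid)`, so higher
    /// scores mean closer to the centroid; at most the configured number of
    /// prototypes per cluster is returned.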
    #[instrument(skip(self, cluster, embeddings), fields(cluster_id = %cluster.id))]
    pub async fn compute_prototypes(
        &self,
        cluster: &Cluster,
        embeddings: &HashMap<EmbeddingId, Vec<f32>>,
    ) -> Result<Vec<Prototype>> {
        if cluster.member_ids.is_empty() {
            return Ok(Vec::new());
        }

        let n_prototypes = self.config.prototypes_per_cluster.min(cluster.member_ids.len());
        let mut scored_members: Vec<(EmbeddingId, f32)> = Vec::new();

        for &member_id in &cluster.member_ids {
            if let Some(vec) = embeddings.get(&member_id) {
                let distance = self.compute_distance(vec, &cluster.centroid);
                let score = 1.0 / (1.0 + distance);
                scored_members.push((member_id, score));
            }
        }

        scored_members.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        let prototypes: Vec<Prototype> = scored_members
            .into_iter()
            .take(n_prototypes)
            .map(|(id, score)| Prototype::new(id, cluster.id, score))
            .collect();

        debug!(
            cluster_id = %cluster.id,
            n_prototypes = prototypes.len(),
            "Computed cluster prototypes"
        );

        Ok(prototypes)
    }

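    /// Runs the configured clustering method and packages the result with
    /// noise points, an optional silhouette score, and optional prototypes.
    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an async runtime such as tokio and a
    /// default configuration whose method is supported:
    ///
    /// ```ignore
    /// let service = ClusteringService::default_service();
    /// let result = service.cluster_with_metrics(&embeddings).await?;
    /// println!("{} clusters, {} noise points", result.clusters.len(), result.noise.len());
    /// ```
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InvalidConfig`] for unsupported methods, or
    /// any error from the underlying clustering run.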
    #[instrument(skip(self, embeddings))]
    pub async fn cluster_with_metrics(
        &self,
        embeddings: &[EmbeddingWithId],
    ) -> Result<ClusteringResult> {
        let clusters = match &self.config.method {
            ClusteringMethod::HDBSCAN => self.run_hdbscan(embeddings).await?,
            ClusteringMethod::KMeans { k } => self.run_kmeans(embeddings, *k).await?,
            _ => {
                return Err(AnalysisError::InvalidConfig(
                    "Unsupported clustering method".to_string(),
                ))
            }
        };

        let assigned: std::collections::HashSet<_> = clusters
            .iter()
            .flat_map(|c| c.member_ids.iter())
            .copied()
            .collect();

        let noise: Vec<EmbeddingId> = embeddings
            .iter()
            .map(|(id, _)| *id)
            .filter(|id| !assigned.contains(id))
            .collect();

        let silhouette_score = if self.config.compute_silhouette && !clusters.is_empty() {
            let labels = self.clusters_to_labels(&clusters, embeddings);
            let matrix = self.embeddings_to_matrix(embeddings);
            Some(SilhouetteScore::new().compute(&matrix, &labels))
        } else {
            None
        };

        let embedding_map: HashMap<EmbeddingId, Vec<f32>> = embeddings
            .iter()
            .map(|(id, vec)| (*id, vec.clone()))
            .collect();

        let mut prototypes = Vec::new();
        if self.config.compute_prototypes {
            for cluster in &clusters {
                let cluster_prototypes = self.compute_prototypes(cluster, &embedding_map).await?;
                prototypes.extend(cluster_prototypes);
            }
        }

        Ok(ClusteringResult {
            clusters,
            noise,
            silhouette_score,
            v_measure: None,
            prototypes,
            parameters: self.config.parameters.clone(),
            method: self.config.method.clone(),
        })
    }

    /// Builds `Cluster` entities from HDBSCAN-style labels, where a negative
    /// label marks a noise point that belongs to no cluster.
    fn labels_to_clusters(
        &self,
        embeddings: &[EmbeddingWithId],
        labels: &[i32],
    ) -> Result<Vec<Cluster>> {
        let mut cluster_members: HashMap<i32, Vec<EmbeddingId>> = HashMap::new();
        let mut cluster_vectors: HashMap<i32, Vec<Vec<f32>>> = HashMap::new();

        for ((id, vec), &label) in embeddings.iter().zip(labels.iter()) {
            if label >= 0 {
                cluster_members.entry(label).or_default().push(*id);
                cluster_vectors.entry(label).or_default().push(vec.clone());
            }
        }

        let mut clusters = Vec::new();
        for (label, member_ids) in cluster_members {
            let vectors = &cluster_vectors[&label];
            let centroid = self.compute_centroid(vectors);
            let variance = self.compute_variance(vectors, &centroid);

            let prototype_id = member_ids
                .iter()
                .min_by(|a, b| {
                    let idx_a = embeddings.iter().position(|(id, _)| id == *a).unwrap();
                    let idx_b = embeddings.iter().position(|(id, _)| id == *b).unwrap();
                    let dist_a = self.compute_distance(&embeddings[idx_a].1, &centroid);
                    let dist_b = self.compute_distance(&embeddings[idx_b].1, &centroid);
                    dist_a.partial_cmp(&dist_b).unwrap_or(std::cmp::Ordering::Equal)
                })
                .copied()
                .unwrap_or_else(EmbeddingId::new);

            clusters.push(Cluster::new(prototype_id, member_ids, centroid, variance));
        }

        Ok(clusters)
    }

    fn labels_to_clusters_with_centroids(
        &self,
        embeddings: &[EmbeddingWithId],
        labels: &[usize],
        centroids: &Array2<f32>,
    ) -> Result<Vec<Cluster>> {
        let k = centroids.nrows();
        let mut cluster_members: Vec<Vec<EmbeddingId>> = vec![Vec::new(); k];
        let mut cluster_vectors: Vec<Vec<Vec<f32>>> = vec![Vec::new(); k];

        for ((id, vec), &label) in embeddings.iter().zip(labels.iter()) {
            if label < k {
                cluster_members[label].push(*id);
                cluster_vectors[label].push(vec.clone());
            }
        }

        let mut clusters = Vec::new();
        for (i, member_ids) in cluster_members.into_iter().enumerate() {
            if member_ids.is_empty() {
                continue;
            }

            let centroid: Vec<f32> = centroids.row(i).to_vec();
            let variance = self.compute_variance(&cluster_vectors[i], &centroid);

            let prototype_id = member_ids
                .iter()
                .min_by(|a, b| {
                    let idx_a = embeddings.iter().position(|(id, _)| id == *a).unwrap();
                    let idx_b = embeddings.iter().position(|(id, _)| id == *b).unwrap();
                    let dist_a = self.compute_distance(&embeddings[idx_a].1, &centroid);
                    let dist_b = self.compute_distance(&embeddings[idx_b].1, &centroid);
                    dist_a.partial_cmp(&dist_b).unwrap_or(std::cmp::Ordering::Equal)
                })
                .copied()
                .unwrap_or_else(EmbeddingId::new);

            clusters.push(Cluster::new(prototype_id, member_ids, centroid, variance));
        }

        Ok(clusters)
    }

    fn compute_centroid(&self, vectors: &[Vec<f32>]) -> Vec<f32> {
        if vectors.is_empty() {
            return Vec::new();
        }

        let dim = vectors[0].len();
        let n = vectors.len() as f32;
        let mut centroid = vec![0.0; dim];

        for vec in vectors {
            for (i, &val) in vec.iter().enumerate() {
                centroid[i] += val;
            }
        }

        for val in &mut centroid {
            *val /= n;
        }

        centroid
    }

    fn compute_variance(&self, vectors: &[Vec<f32>], centroid: &[f32]) -> f32 {
        if vectors.is_empty() {
            return 0.0;
        }

        let mut total_variance = 0.0;
        for vec in vectors {
            let dist = self.compute_distance(vec, centroid);
            total_variance += dist * dist;
        }

        total_variance / vectors.len() as f32
    }

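    /// Computes the distance between two vectors under the configured metric.
    /// Note that `Poincare` currently falls back to plain Euclidean distance.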
    fn compute_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        match self.config.parameters.metric {
            DistanceMetric::Cosine => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
                let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
                if norm_a == 0.0 || norm_b == 0.0 {
                    1.0
                } else {
                    1.0 - (dot / (norm_a * norm_b))
                }
            }
            DistanceMetric::Euclidean => {
                a.iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).powi(2))
                    .sum::<f32>()
                    .sqrt()
            }
            DistanceMetric::Manhattan => {
                a.iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).abs())
                    .sum()
            }
            DistanceMetric::Poincare => {
                // A true hyperbolic distance is not implemented here; fall
                // back to the Euclidean distance between the two points.
                let euclidean: f32 = a
                    .iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).powi(2))
                    .sum::<f32>()
                    .sqrt();
                euclidean
            }
        }
    }

    fn clusters_to_labels(
        &self,
        clusters: &[Cluster],
        embeddings: &[EmbeddingWithId],
    ) -> Vec<i32> {
        let mut labels = vec![-1i32; embeddings.len()];
        let id_to_idx: HashMap<EmbeddingId, usize> = embeddings
            .iter()
            .enumerate()
            .map(|(i, (id, _))| (*id, i))
            .collect();

        for (cluster_idx, cluster) in clusters.iter().enumerate() {
            for member_id in &cluster.member_ids {
                if let Some(&idx) = id_to_idx.get(member_id) {
                    labels[idx] = cluster_idx as i32;
                }
            }
        }

        labels
    }

    fn embeddings_to_matrix(&self, embeddings: &[EmbeddingWithId]) -> Array2<f32> {
        if embeddings.is_empty() {
            return Array2::zeros((0, 0));
        }

        let dim = embeddings[0].1.len();
        let n = embeddings.len();
        let mut matrix = Array2::<f32>::zeros((n, dim));

        for (i, (_, vec)) in embeddings.iter().enumerate() {
            for (j, &val) in vec.iter().enumerate() {
                matrix[[i, j]] = val;
            }
        }

        matrix
    }
}

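/// Finds recurring subsequences (motifs) in cluster-label sequences and
/// derives transition statistics between clusters.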
pub struct MotifDetectionService {
    config: MotifConfig,
}

impl MotifDetectionService {
    /// Creates a motif detection service with the given configuration.
    #[must_use]
    pub fn new(config: MotifConfig) -> Self {
        Self { config }
    }

    /// Creates a motif detection service with the default configuration.
    #[must_use]
    pub fn default_service() -> Self {
        Self::new(MotifConfig::default())
    }

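    /// Detects motifs: subsequences of cluster labels that recur across the
    /// input sequences often enough to satisfy the configured minimums.
    ///
    /// The effective minimum length is the larger of `min_length` and the
    /// configured minimum; durations assume a fixed 500 ms per segment.
    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an async runtime such as tokio:
    ///
    /// ```ignore
    /// let service = MotifDetectionService::default_service();
    /// let motifs = service.detect_motifs(&sequences, 2).await?;
    /// for motif in &motifs {
    ///     println!("motif of length {}", motif.length());
    /// }
    /// ```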
    #[instrument(skip(self, sequences), fields(n_sequences = sequences.len()))]
    pub async fn detect_motifs(
        &self,
        sequences: &[Vec<ClusterId>],
        min_length: usize,
    ) -> Result<Vec<Motif>> {
        if sequences.is_empty() {
            return Ok(Vec::new());
        }

        let effective_min_length = min_length.max(self.config.min_length);
        let effective_max_length = self.config.max_length;

        info!(
            n_sequences = sequences.len(),
            min_length = effective_min_length,
            max_length = effective_max_length,
            "Starting motif detection"
        );

        let mut all_motifs: HashMap<Vec<ClusterId>, Vec<(usize, usize)>> = HashMap::new();

        for (seq_idx, sequence) in sequences.iter().enumerate() {
            for length in effective_min_length..=effective_max_length {
                if sequence.len() < length {
                    continue;
                }

                for start in 0..=(sequence.len() - length) {
                    let subsequence: Vec<ClusterId> = sequence[start..start + length].to_vec();
                    all_motifs
                        .entry(subsequence)
                        .or_default()
                        .push((seq_idx, start));
                }
            }
        }

        let motifs: Vec<Motif> = all_motifs
            .into_iter()
            .filter(|(_, occurrences)| occurrences.len() >= self.config.min_occurrences)
            .filter_map(|(sequence, occurrences)| {
                let n_occurrences = occurrences.len();
                let confidence = n_occurrences as f32 / sequences.len() as f32;

                if confidence < self.config.min_confidence {
                    return None;
                }

                // Duration estimate assumes a fixed 500 ms per segment.
                let avg_duration_ms = (sequence.len() * 500) as f64;
                let mut motif = Motif::new(sequence, n_occurrences, avg_duration_ms, confidence);

                for (_seq_idx, start) in occurrences {
                    motif.add_occurrence(MotifOccurrence::new(
                        RecordingId::new(),
                        Vec::new(),
                        (start * 500) as u64,
                        ((start + motif.length()) * 500) as u64,
                        1.0,
                    ));
                }

                Some(motif)
            })
            .collect();

        info!(n_motifs = motifs.len(), "Motif detection completed");

        Ok(motifs)
    }

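    /// Builds a first-order transition matrix over consecutive cluster labels
    /// in the given sequences, then normalizes counts into probabilities.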
    #[instrument(skip(self, sequences))]
    pub async fn compute_transition_matrix(
        &self,
        sequences: &[Vec<ClusterId>],
    ) -> Result<TransitionMatrix> {
        let mut all_clusters: std::collections::HashSet<ClusterId> =
            std::collections::HashSet::new();
        for sequence in sequences {
            for cluster in sequence {
                all_clusters.insert(*cluster);
            }
        }

        let cluster_ids: Vec<ClusterId> = all_clusters.into_iter().collect();
        let mut matrix = TransitionMatrix::new(cluster_ids);

        for sequence in sequences {
            for window in sequence.windows(2) {
                matrix.record_transition(&window[0], &window[1]);
            }
        }

        matrix.compute_probabilities();

        debug!(
            n_clusters = matrix.size(),
            n_transitions = matrix.non_zero_transitions().len(),
            "Computed transition matrix"
        );

        Ok(matrix)
    }

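    /// Finds every exact occurrence of a motif's cluster sequence within the
    /// given per-recording sequences. Offsets assume 500 ms per segment.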
    #[instrument(skip(self, motif, sequences))]
    pub async fn find_motif_occurrences(
        &self,
        motif: &Motif,
        sequences: &[(RecordingId, Vec<ClusterId>)],
    ) -> Result<Vec<MotifOccurrence>> {
        let pattern = &motif.sequence;
        let mut occurrences = Vec::new();

        for (recording_id, sequence) in sequences {
            if sequence.len() < pattern.len() {
                continue;
            }

            for start in 0..=(sequence.len() - pattern.len()) {
                let subsequence = &sequence[start..start + pattern.len()];
                if subsequence == pattern.as_slice() {
                    occurrences.push(MotifOccurrence::new(
                        *recording_id,
                        Vec::new(),
                        (start * 500) as u64,
                        ((start + pattern.len()) * 500) as u64,
                        1.0,
                    ));
                }
            }
        }

        Ok(occurrences)
    }
}

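/// Analyzes cluster-label sequences with Markov-chain statistics such as
/// transition probabilities, entropy, and stereotypy.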
pub struct SequenceAnalysisService {
    analyzer: MarkovAnalyzer,
}

impl SequenceAnalysisService {
    /// Creates a sequence analysis service with a fresh Markov analyzer.
    #[must_use]
    pub fn new() -> Self {
        Self {
            analyzer: MarkovAnalyzer::new(),
        }
    }

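    /// Maps segments to their cluster assignments and computes transition
    /// probabilities, entropy, and a stereotypy score for the recording.
    ///
    /// Stereotypy is `1 - H / H_max`, where `H` is the transition entropy and
    /// `H_max = ln(n_unique_clusters)`, so 1.0 indicates a perfectly
    /// repetitive sequence and 0.0 a maximally unpredictable one.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if `segment_ids` is empty
    /// or fewer than two segments have cluster assignments.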
    #[instrument(skip(self, segment_ids, cluster_assignments))]
    pub async fn analyze_sequence(
        &self,
        segment_ids: &[SegmentId],
        cluster_assignments: &HashMap<SegmentId, ClusterId>,
        recording_id: RecordingId,
    ) -> Result<SequenceAnalysis> {
        if segment_ids.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "Empty segment sequence".to_string(),
            ));
        }

        let cluster_sequence: Vec<ClusterId> = segment_ids
            .iter()
            .filter_map(|seg_id| cluster_assignments.get(seg_id).copied())
            .collect();

        if cluster_sequence.len() < 2 {
            return Err(AnalysisError::InsufficientData(
                "Need at least 2 segments for sequence analysis".to_string(),
            ));
        }

        let mut transitions: HashMap<(ClusterId, ClusterId), u32> = HashMap::new();
        for window in cluster_sequence.windows(2) {
            *transitions.entry((window[0], window[1])).or_insert(0) += 1;
        }

        let total_transitions = transitions.values().sum::<u32>() as f32;
        let transition_probs: Vec<(ClusterId, ClusterId, f32)> = transitions
            .into_iter()
            .map(|((from, to), count)| (from, to, count as f32 / total_transitions))
            .collect();

        let entropy = self.compute_entropy(&transition_probs);

        let n_unique_clusters = cluster_sequence
            .iter()
            .collect::<std::collections::HashSet<_>>()
            .len();
        let max_entropy = (n_unique_clusters as f32).ln();
        let normalized_entropy = if max_entropy > 0.0 {
            entropy / max_entropy
        } else {
            0.0
        };
        let stereotypy_score = 1.0 - normalized_entropy;

        let mut analysis = SequenceAnalysis::new(
            recording_id,
            transition_probs,
            entropy,
            stereotypy_score,
        );
        analysis.set_sequence(cluster_sequence, segment_ids.to_vec());

        info!(
            recording_id = %recording_id,
            entropy = entropy,
            stereotypy = stereotypy_score,
            "Sequence analysis completed"
        );

        Ok(analysis)
    }

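    /// Computes the Shannon entropy of a set of transition probabilities,
    /// delegating to the underlying Markov analyzer.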
    #[must_use]
    pub fn compute_entropy(&self, transitions: &[(ClusterId, ClusterId, f32)]) -> f32 {
        self.analyzer.compute_entropy(transitions)
    }

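    /// Computes summary metrics for a cluster sequence: entropy, stereotypy,
    /// repetition rate, and transition counts. Sequences shorter than two
    /// elements yield default metrics.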
    #[instrument(skip(self, cluster_sequence))]
    pub async fn compute_metrics(
        &self,
        cluster_sequence: &[ClusterId],
    ) -> Result<SequenceMetrics> {
        if cluster_sequence.len() < 2 {
            return Ok(SequenceMetrics::default());
        }

        let mut transitions: HashMap<(ClusterId, ClusterId), u32> = HashMap::new();
        let mut self_transitions = 0u32;

        for window in cluster_sequence.windows(2) {
            *transitions.entry((window[0], window[1])).or_insert(0) += 1;
            if window[0] == window[1] {
                self_transitions += 1;
            }
        }

        let total_transitions = (cluster_sequence.len() - 1) as u32;
        let unique_clusters: std::collections::HashSet<_> = cluster_sequence.iter().collect();

        let transition_probs: Vec<(ClusterId, ClusterId, f32)> = transitions
            .iter()
            .map(|(&(from, to), &count)| (from, to, count as f32 / total_transitions as f32))
            .collect();

        let entropy = self.compute_entropy(&transition_probs);
        let max_entropy = (unique_clusters.len() as f32).ln().max(1.0);
        let normalized_entropy = entropy / max_entropy;

        let dominant_transition = transition_probs
            .iter()
            .max_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
            .map(|&(from, to, prob)| (from, to, prob));

        Ok(SequenceMetrics {
            entropy,
            normalized_entropy,
            stereotypy: 1.0 - normalized_entropy,
            unique_clusters: unique_clusters.len(),
            unique_transitions: transitions.len(),
            total_transitions: total_transitions as usize,
            dominant_transition,
            repetition_rate: self_transitions as f32 / total_transitions as f32,
        })
    }
}

impl Default for SequenceAnalysisService {
    fn default() -> Self {
        Self::new()
    }
}

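/// Flags embeddings that sit far from every cluster centroid, scoring them
/// by distance and classifying them by anomaly type.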
pub struct AnomalyDetectionService {
    /// Minimum anomaly score for an embedding to be reported.
    threshold: f32,

    /// Number of neighbors used for local outlier factor computation.
    k_neighbors: usize,
}

impl AnomalyDetectionService {
    /// Creates an anomaly detection service with the given score threshold
    /// and neighborhood size.
    #[must_use]
    pub fn new(threshold: f32, k_neighbors: usize) -> Self {
        Self {
            threshold,
            k_neighbors,
        }
    }

    /// Creates a service with a 0.5 threshold and 20 neighbors.
    #[must_use]
    pub fn default_service() -> Self {
        Self::new(0.5, 20)
    }

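    /// Scores every embedding against its nearest cluster centroid and
    /// returns those whose anomaly score exceeds the configured threshold.
    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an async runtime such as tokio:
    ///
    /// ```ignore
    /// let service = AnomalyDetectionService::new(0.6, 20);
    /// let anomalies = service.detect_anomalies(&embeddings, &clusters).await?;
    /// println!("{} anomalies above threshold", anomalies.len());
    /// ```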
    #[instrument(skip(self, embeddings, clusters))]
    pub async fn detect_anomalies(
        &self,
        embeddings: &[EmbeddingWithId],
        clusters: &[Cluster],
    ) -> Result<Vec<Anomaly>> {
        if clusters.is_empty() {
            return Ok(Vec::new());
        }

        let mut anomalies = Vec::new();

        let assigned: std::collections::HashSet<EmbeddingId> = clusters
            .iter()
            .flat_map(|c| c.member_ids.iter())
            .copied()
            .collect();

        for (embedding_id, vector) in embeddings {
            let (nearest_cluster, distance) = clusters
                .iter()
                .map(|c| {
                    let dist = self.cosine_distance(vector, &c.centroid);
                    (c.id, dist)
                })
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
                .unwrap();

            // Cluster members are scored by how far they drift from their
            // nearest centroid; unassigned embeddings get a 0.5 baseline.
            let in_cluster = assigned.contains(embedding_id);
            let anomaly_score = if in_cluster {
                (distance * 2.0).min(1.0)
            } else {
                (0.5 + distance).min(1.0)
            };

            if anomaly_score > self.threshold {
                let mut anomaly = Anomaly::new(
                    *embedding_id,
                    anomaly_score,
                    nearest_cluster,
                    distance,
                );

                if !in_cluster {
                    anomaly.set_type(AnomalyType::Novel);
                } else if distance > 0.5 {
                    anomaly.set_type(AnomalyType::Outlier);
                } else {
                    anomaly.set_type(AnomalyType::Rare);
                }

                anomalies.push(anomaly);
            }
        }

        info!(
            n_anomalies = anomalies.len(),
            threshold = self.threshold,
            "Anomaly detection completed"
        );

        Ok(anomalies)
    }

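    /// Computes the local outlier factor (LOF) for each embedding using
    /// `k_neighbors` nearest neighbors under cosine distance.
    ///
    /// Returns an empty map (with a warning) when there are not enough
    /// embeddings to form k-neighborhoods. The pairwise distance pass is
    /// O(n^2) in the number of embeddings.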
    #[instrument(skip(self, embeddings))]
    pub async fn compute_lof(
        &self,
        embeddings: &[EmbeddingWithId],
    ) -> Result<HashMap<EmbeddingId, f32>> {
        if embeddings.len() <= self.k_neighbors {
            warn!(
                n_embeddings = embeddings.len(),
                k = self.k_neighbors,
                "Not enough embeddings for LOF computation"
            );
            return Ok(HashMap::new());
        }

        let n = embeddings.len();
        let k = self.k_neighbors.min(n - 1);

        // Pairwise distances, sorted ascending per point.
        let mut distances: Vec<Vec<(usize, f32)>> = Vec::with_capacity(n);
        for i in 0..n {
            let mut row: Vec<(usize, f32)> = Vec::with_capacity(n - 1);
            for j in 0..n {
                if i != j {
                    let dist = self.cosine_distance(&embeddings[i].1, &embeddings[j].1);
                    row.push((j, dist));
                }
            }
            row.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
            distances.push(row);
        }

        // Distance to each point's k-th nearest neighbor.
        let k_distances: Vec<f32> = distances
            .iter()
            .map(|d| d.get(k - 1).map_or(f32::MAX, |x| x.1))
            .collect();

        // Local reachability density: inverse of the mean reachability distance.
        let mut lrd: Vec<f32> = vec![0.0; n];
        for i in 0..n {
            let mut reach_dist_sum = 0.0;
            for &(j, dist) in distances[i].iter().take(k) {
                reach_dist_sum += k_distances[j].max(dist);
            }
            lrd[i] = if reach_dist_sum > 0.0 {
                k as f32 / reach_dist_sum
            } else {
                f32::MAX
            };
        }

        // LOF: mean ratio of each neighbor's density to the point's own density.
        let mut lof_scores: HashMap<EmbeddingId, f32> = HashMap::new();
        for i in 0..n {
            let mut lof_sum = 0.0;
            for &(j, _) in distances[i].iter().take(k) {
                if lrd[i] > 0.0 {
                    lof_sum += lrd[j] / lrd[i];
                }
            }
            let lof = lof_sum / k as f32;
            lof_scores.insert(embeddings[i].0, lof);
        }

        Ok(lof_scores)
    }

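    /// Classifies an anomaly by its distance to the centroid, the size of
    /// its nearest cluster, and its local outlier factor, if available.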
    #[must_use]
    pub fn classify_anomaly(
        &self,
        anomaly: &Anomaly,
        cluster_member_count: usize,
    ) -> AnomalyType {
        if anomaly.distance_to_centroid > 0.8 {
            AnomalyType::Artifact
        } else if cluster_member_count < 3 {
            AnomalyType::Rare
        } else if anomaly.local_outlier_factor.map_or(false, |lof| lof > 2.0) {
            AnomalyType::Outlier
        } else {
            AnomalyType::Novel
        }
    }

    fn cosine_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm_a == 0.0 || norm_b == 0.0 {
            1.0
        } else {
            1.0 - (dot / (norm_a * norm_b))
        }
    }
}

impl Default for AnomalyDetectionService {
    fn default() -> Self {
        Self::default_service()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_embeddings(n: usize, dim: usize) -> Vec<EmbeddingWithId> {
        (0..n)
            .map(|i| {
                let vec: Vec<f32> = (0..dim)
                    .map(|j| ((i * dim + j) as f32 * 0.01).sin())
                    .collect();
                (EmbeddingId::new(), vec)
            })
            .collect()
    }

    #[tokio::test]
    async fn test_clustering_service_insufficient_data() {
        let service = ClusteringService::default_service();
        let result = service.run_hdbscan(&[]).await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_motif_detection_empty_sequences() {
        let service = MotifDetectionService::default_service();
        let motifs = service.detect_motifs(&[], 2).await.unwrap();
        assert!(motifs.is_empty());
    }

    #[tokio::test]
    async fn test_motif_detection_finds_patterns() {
        let service = MotifDetectionService::new(MotifConfig {
            min_length: 2,
            max_length: 5,
            min_occurrences: 2,
            min_confidence: 0.0,
            allow_overlap: false,
            max_gap: 0,
        });

        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let c3 = ClusterId::new();

        let sequences = vec![
            vec![c1, c2, c3, c1, c2, c3],
            vec![c1, c2, c3, c2, c1],
            vec![c2, c1, c2, c3, c1, c2],
        ];

        let motifs = service.detect_motifs(&sequences, 2).await.unwrap();
        assert!(!motifs.is_empty());

        let has_c1_c2 = motifs.iter().any(|m| m.sequence == vec![c1, c2]);
        assert!(has_c1_c2);
    }

    #[tokio::test]
    async fn test_sequence_analysis_computes_entropy() {
        let service = SequenceAnalysisService::new();

        let c1 = ClusterId::new();
        let c2 = ClusterId::new();

        let metrics = service
            .compute_metrics(&[c1, c2, c1, c2, c1, c2])
            .await
            .unwrap();

        assert!(metrics.entropy > 0.0);
        assert!(metrics.stereotypy >= 0.0 && metrics.stereotypy <= 1.0);
    }

    #[tokio::test]
    async fn test_anomaly_detection_empty_clusters() {
        let service = AnomalyDetectionService::default_service();
        let embeddings = create_test_embeddings(10, 16);
        let anomalies = service.detect_anomalies(&embeddings, &[]).await.unwrap();
        assert!(anomalies.is_empty());
    }

    #[test]
    fn test_cosine_distance() {
        let service = AnomalyDetectionService::default_service();

        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        assert!((service.cosine_distance(&a, &b) - 0.0).abs() < 0.001);

        let c = vec![0.0, 1.0, 0.0];
        assert!((service.cosine_distance(&a, &c) - 1.0).abs() < 0.001);
    }
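
    // A sketch of an additional guard-rail test: compute_lof is documented to
    // return an empty map when there are fewer embeddings than k_neighbors
    // (the default service uses k = 20).
    #[tokio::test]
    async fn test_lof_insufficient_embeddings_returns_empty() {
        let service = AnomalyDetectionService::default_service();
        let embeddings = create_test_embeddings(5, 8);
        let scores = service.compute_lof(&embeddings).await.unwrap();
        assert!(scores.is_empty());
    }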
}