use std::collections::HashMap;

use ndarray::Array2;
use thiserror::Error;
use tracing::{debug, info, instrument, warn};

use crate::domain::entities::{
    Anomaly, AnomalyType, Cluster, ClusterId, EmbeddingId, Motif, MotifOccurrence, Prototype,
    RecordingId, SegmentId, SequenceAnalysis,
};
use crate::domain::value_objects::{
    ClusteringConfig, ClusteringMethod, ClusteringResult, DistanceMetric, MotifConfig,
    SequenceMetrics, TransitionMatrix,
};
use crate::infrastructure::{HdbscanClusterer, KMeansClusterer, MarkovAnalyzer};
use crate::metrics::SilhouetteScore;

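/// Errors produced by the clustering, motif, sequence, and anomaly services.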
#[derive(Debug, Error)]
pub enum AnalysisError {
    #[error("Insufficient data: {0}")]
    InsufficientData(String),

    #[error("Invalid configuration: {0}")]
    InvalidConfig(String),

    #[error("Clustering failed: {0}")]
    ClusteringFailed(String),

    #[error("Motif detection failed: {0}")]
    MotifDetectionFailed(String),

    #[error("Sequence analysis failed: {0}")]
    SequenceAnalysisFailed(String),

    #[error("Anomaly detection failed: {0}")]
    AnomalyDetectionFailed(String),

    #[error("Computation error: {0}")]
    ComputationError(String),

    #[error("Infrastructure error: {0}")]
    Infrastructure(String),
}

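/// Result alias used throughout the analysis services.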
pub type Result<T> = std::result::Result<T, AnalysisError>;

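/// An embedding vector paired with its identifier.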
pub type EmbeddingWithId = (EmbeddingId, Vec<f32>);

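/// Groups embeddings into clusters and derives cluster-level artifacts such
/// as prototypes and quality metrics.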
pub struct ClusteringService {
    config: ClusteringConfig,
}

impl ClusteringService {
    /// Creates a clustering service with the given configuration.
    #[must_use]
    pub fn new(config: ClusteringConfig) -> Self {
        Self { config }
    }

    /// Creates a clustering service with the default configuration.
    #[must_use]
    pub fn default_service() -> Self {
        Self::new(ClusteringConfig::default())
    }

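    /// Runs HDBSCAN over the given embeddings and returns the discovered clusters.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if the input is empty or
    /// smaller than the configured minimum cluster size, and
    /// [`AnalysisError::InvalidConfig`] if embedding dimensions are inconsistent.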
    #[instrument(skip(self, embeddings), fields(n_embeddings = embeddings.len()))]
    pub async fn run_hdbscan(
        &self,
        embeddings: &[EmbeddingWithId],
    ) -> Result<Vec<Cluster>> {
        if embeddings.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "Cannot cluster empty embedding set".to_string(),
            ));
        }

        if embeddings.len() < self.config.parameters.min_cluster_size {
            return Err(AnalysisError::InsufficientData(format!(
                "Need at least {} embeddings for HDBSCAN, got {}",
                self.config.parameters.min_cluster_size,
                embeddings.len()
            )));
        }

        info!(
            n_embeddings = embeddings.len(),
            min_cluster_size = self.config.parameters.min_cluster_size,
            min_samples = self.config.parameters.min_samples,
            "Starting HDBSCAN clustering"
        );

        let dim = embeddings[0].1.len();
        let n = embeddings.len();
        let mut matrix = Array2::<f32>::zeros((n, dim));

        for (i, (_, vec)) in embeddings.iter().enumerate() {
            if vec.len() != dim {
                return Err(AnalysisError::InvalidConfig(format!(
                    "Embedding dimension mismatch: expected {}, got {}",
                    dim,
                    vec.len()
                )));
            }
            for (j, &val) in vec.iter().enumerate() {
                matrix[[i, j]] = val;
            }
        }

        let clusterer = HdbscanClusterer::new(
            self.config.parameters.min_cluster_size,
            self.config.parameters.min_samples,
            self.config.parameters.metric,
        );

        let labels = clusterer.fit(&matrix)?;

        let clusters = self.labels_to_clusters(embeddings, &labels)?;

        info!(
            n_clusters = clusters.len(),
            "HDBSCAN clustering completed"
        );

        Ok(clusters)
    }

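    /// Runs K-means with `k` clusters over the given embeddings.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if the input is empty or
    /// contains fewer than `k` embeddings.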
    #[instrument(skip(self, embeddings), fields(n_embeddings = embeddings.len(), k = k))]
    pub async fn run_kmeans(
        &self,
        embeddings: &[EmbeddingWithId],
        k: usize,
    ) -> Result<Vec<Cluster>> {
        if embeddings.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "Cannot cluster empty embedding set".to_string(),
            ));
        }

        if embeddings.len() < k {
            return Err(AnalysisError::InsufficientData(format!(
                "Need at least {} embeddings for K-means with k={}, got {}",
                k,
                k,
                embeddings.len()
            )));
        }

        info!(n_embeddings = embeddings.len(), k = k, "Starting K-means clustering");

        let dim = embeddings[0].1.len();
        let n = embeddings.len();
        let mut matrix = Array2::<f32>::zeros((n, dim));

        for (i, (_, vec)) in embeddings.iter().enumerate() {
            for (j, &val) in vec.iter().enumerate() {
                matrix[[i, j]] = val;
            }
        }

        let clusterer = KMeansClusterer::new(k, self.config.random_seed);
        let (labels, centroids) = clusterer.fit(&matrix)?;

        let clusters = self.labels_to_clusters_with_centroids(
            embeddings,
            &labels,
            &centroids,
        )?;

        info!(n_clusters = clusters.len(), "K-means clustering completed");

        Ok(clusters)
    }

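    /// Assigns an embedding to the cluster with the nearest centroid.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if `clusters` is empty.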
    #[instrument(skip(self, embedding, clusters), fields(n_clusters = clusters.len()))]
    pub async fn assign_to_nearest(
        &self,
        embedding: &[f32],
        clusters: &[Cluster],
    ) -> Result<ClusterId> {
        if clusters.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "No clusters available for assignment".to_string(),
            ));
        }

        let mut min_distance = f32::MAX;
        let mut nearest_cluster = clusters[0].id;

        for cluster in clusters {
            let distance = self.compute_distance(embedding, &cluster.centroid);
            if distance < min_distance {
                min_distance = distance;
                nearest_cluster = cluster.id;
            }
        }

        debug!(
            cluster_id = %nearest_cluster,
            distance = min_distance,
            "Assigned embedding to nearest cluster"
        );

        Ok(nearest_cluster)
    }

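    /// Selects the most central members of a cluster as its prototypes.
    ///
    /// Members are scored by `1 / (1 + distance_to_centroid)`, so higher
    /// scores mean closer to the centroid; at most the configured number of
    /// prototypes per cluster is returned.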
    #[instrument(skip(self, cluster, embeddings), fields(cluster_id = %cluster.id))]
    pub async fn compute_prototypes(
        &self,
        cluster: &Cluster,
        embeddings: &HashMap<EmbeddingId, Vec<f32>>,
    ) -> Result<Vec<Prototype>> {
        if cluster.member_ids.is_empty() {
            return Ok(Vec::new());
        }

        let n_prototypes = self.config.prototypes_per_cluster.min(cluster.member_ids.len());
        let mut scored_members: Vec<(EmbeddingId, f32)> = Vec::new();

        for &member_id in &cluster.member_ids {
            if let Some(vec) = embeddings.get(&member_id) {
                let distance = self.compute_distance(vec, &cluster.centroid);
                let score = 1.0 / (1.0 + distance);
                scored_members.push((member_id, score));
            }
        }

        scored_members.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

        let prototypes: Vec<Prototype> = scored_members
            .into_iter()
            .take(n_prototypes)
            .map(|(id, score)| Prototype::new(id, cluster.id, score))
            .collect();

        debug!(
            cluster_id = %cluster.id,
            n_prototypes = prototypes.len(),
            "Computed cluster prototypes"
        );

        Ok(prototypes)
    }

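    /// Runs the configured clustering method and packages the result with
    /// noise points, an optional silhouette score, and optional prototypes.
    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an async runtime such as tokio and a
    /// default configuration whose method is supported:
    ///
    /// ```ignore
    /// let service = ClusteringService::default_service();
    /// let result = service.cluster_with_metrics(&embeddings).await?;
    /// println!("{} clusters, {} noise points", result.clusters.len(), result.noise.len());
    /// ```
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InvalidConfig`] for unsupported methods, or
    /// any error from the underlying clustering run.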
    #[instrument(skip(self, embeddings))]
    pub async fn cluster_with_metrics(
        &self,
        embeddings: &[EmbeddingWithId],
    ) -> Result<ClusteringResult> {
        let clusters = match &self.config.method {
            ClusteringMethod::HDBSCAN => self.run_hdbscan(embeddings).await?,
            ClusteringMethod::KMeans { k } => self.run_kmeans(embeddings, *k).await?,
            _ => {
                return Err(AnalysisError::InvalidConfig(
                    "Unsupported clustering method".to_string(),
                ))
            }
        };

        let assigned: std::collections::HashSet<_> = clusters
            .iter()
            .flat_map(|c| c.member_ids.iter())
            .copied()
            .collect();

        let noise: Vec<EmbeddingId> = embeddings
            .iter()
            .map(|(id, _)| *id)
            .filter(|id| !assigned.contains(id))
            .collect();

        let silhouette_score = if self.config.compute_silhouette && !clusters.is_empty() {
            let labels = self.clusters_to_labels(&clusters, embeddings);
            let matrix = self.embeddings_to_matrix(embeddings);
            Some(SilhouetteScore::new().compute(&matrix, &labels))
        } else {
            None
        };

        let embedding_map: HashMap<EmbeddingId, Vec<f32>> = embeddings
            .iter()
            .map(|(id, vec)| (*id, vec.clone()))
            .collect();

        let mut prototypes = Vec::new();
        if self.config.compute_prototypes {
            for cluster in &clusters {
                let cluster_prototypes = self.compute_prototypes(cluster, &embedding_map).await?;
                prototypes.extend(cluster_prototypes);
            }
        }

        Ok(ClusteringResult {
            clusters,
            noise,
            silhouette_score,
            v_measure: None,
            prototypes,
            parameters: self.config.parameters.clone(),
            method: self.config.method.clone(),
        })
    }

    /// Builds `Cluster` entities from HDBSCAN-style labels, where a negative
    /// label marks a noise point that belongs to no cluster.
    fn labels_to_clusters(
        &self,
        embeddings: &[EmbeddingWithId],
        labels: &[i32],
    ) -> Result<Vec<Cluster>> {
        let mut cluster_members: HashMap<i32, Vec<EmbeddingId>> = HashMap::new();
        let mut cluster_vectors: HashMap<i32, Vec<Vec<f32>>> = HashMap::new();

        for ((id, vec), &label) in embeddings.iter().zip(labels.iter()) {
            if label >= 0 {
                cluster_members.entry(label).or_default().push(*id);
                cluster_vectors.entry(label).or_default().push(vec.clone());
            }
        }

        let mut clusters = Vec::new();
        for (label, member_ids) in cluster_members {
            let vectors = &cluster_vectors[&label];
            let centroid = self.compute_centroid(vectors);
            let variance = self.compute_variance(vectors, &centroid);

            let prototype_id = member_ids
                .iter()
                .min_by(|a, b| {
                    let idx_a = embeddings.iter().position(|(id, _)| id == *a).unwrap();
                    let idx_b = embeddings.iter().position(|(id, _)| id == *b).unwrap();
                    let dist_a = self.compute_distance(&embeddings[idx_a].1, &centroid);
                    let dist_b = self.compute_distance(&embeddings[idx_b].1, &centroid);
                    dist_a.partial_cmp(&dist_b).unwrap_or(std::cmp::Ordering::Equal)
                })
                .copied()
                .unwrap_or_else(EmbeddingId::new);

            clusters.push(Cluster::new(prototype_id, member_ids, centroid, variance));
        }

        Ok(clusters)
    }

    fn labels_to_clusters_with_centroids(
        &self,
        embeddings: &[EmbeddingWithId],
        labels: &[usize],
        centroids: &Array2<f32>,
    ) -> Result<Vec<Cluster>> {
        let k = centroids.nrows();
        let mut cluster_members: Vec<Vec<EmbeddingId>> = vec![Vec::new(); k];
        let mut cluster_vectors: Vec<Vec<Vec<f32>>> = vec![Vec::new(); k];

        for ((id, vec), &label) in embeddings.iter().zip(labels.iter()) {
            if label < k {
                cluster_members[label].push(*id);
                cluster_vectors[label].push(vec.clone());
            }
        }

        let mut clusters = Vec::new();
        for (i, member_ids) in cluster_members.into_iter().enumerate() {
            if member_ids.is_empty() {
                continue;
            }

            let centroid: Vec<f32> = centroids.row(i).to_vec();
            let variance = self.compute_variance(&cluster_vectors[i], &centroid);

            let prototype_id = member_ids
                .iter()
                .min_by(|a, b| {
                    let idx_a = embeddings.iter().position(|(id, _)| id == *a).unwrap();
                    let idx_b = embeddings.iter().position(|(id, _)| id == *b).unwrap();
                    let dist_a = self.compute_distance(&embeddings[idx_a].1, &centroid);
                    let dist_b = self.compute_distance(&embeddings[idx_b].1, &centroid);
                    dist_a.partial_cmp(&dist_b).unwrap_or(std::cmp::Ordering::Equal)
                })
                .copied()
                .unwrap_or_else(EmbeddingId::new);

            clusters.push(Cluster::new(prototype_id, member_ids, centroid, variance));
        }

        Ok(clusters)
    }

    fn compute_centroid(&self, vectors: &[Vec<f32>]) -> Vec<f32> {
        if vectors.is_empty() {
            return Vec::new();
        }

        let dim = vectors[0].len();
        let n = vectors.len() as f32;
        let mut centroid = vec![0.0; dim];

        for vec in vectors {
            for (i, &val) in vec.iter().enumerate() {
                centroid[i] += val;
            }
        }

        for val in &mut centroid {
            *val /= n;
        }

        centroid
    }

    fn compute_variance(&self, vectors: &[Vec<f32>], centroid: &[f32]) -> f32 {
        if vectors.is_empty() {
            return 0.0;
        }

        let mut total_variance = 0.0;
        for vec in vectors {
            let dist = self.compute_distance(vec, centroid);
            total_variance += dist * dist;
        }

        total_variance / vectors.len() as f32
    }

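    /// Computes the distance between two vectors under the configured metric.
    /// Note that `Poincare` currently falls back to plain Euclidean distance.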
    fn compute_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        match self.config.parameters.metric {
            DistanceMetric::Cosine => {
                let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
                let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
                let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
                if norm_a == 0.0 || norm_b == 0.0 {
                    1.0
                } else {
                    1.0 - (dot / (norm_a * norm_b))
                }
            }
            DistanceMetric::Euclidean => {
                a.iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).powi(2))
                    .sum::<f32>()
                    .sqrt()
            }
            DistanceMetric::Manhattan => {
                a.iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).abs())
                    .sum()
            }
            DistanceMetric::Poincare => {
                // A true hyperbolic distance is not implemented here; fall
                // back to the Euclidean distance between the two points.
                let euclidean: f32 = a
                    .iter()
                    .zip(b.iter())
                    .map(|(x, y)| (x - y).powi(2))
                    .sum::<f32>()
                    .sqrt();
                euclidean
            }
        }
    }

    fn clusters_to_labels(
        &self,
        clusters: &[Cluster],
        embeddings: &[EmbeddingWithId],
    ) -> Vec<i32> {
        let mut labels = vec![-1i32; embeddings.len()];
        let id_to_idx: HashMap<EmbeddingId, usize> = embeddings
            .iter()
            .enumerate()
            .map(|(i, (id, _))| (*id, i))
            .collect();

        for (cluster_idx, cluster) in clusters.iter().enumerate() {
            for member_id in &cluster.member_ids {
                if let Some(&idx) = id_to_idx.get(member_id) {
                    labels[idx] = cluster_idx as i32;
                }
            }
        }

        labels
    }

    fn embeddings_to_matrix(&self, embeddings: &[EmbeddingWithId]) -> Array2<f32> {
        if embeddings.is_empty() {
            return Array2::zeros((0, 0));
        }

        let dim = embeddings[0].1.len();
        let n = embeddings.len();
        let mut matrix = Array2::<f32>::zeros((n, dim));

        for (i, (_, vec)) in embeddings.iter().enumerate() {
            for (j, &val) in vec.iter().enumerate() {
                matrix[[i, j]] = val;
            }
        }

        matrix
    }
}

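/// Finds recurring subsequences (motifs) in cluster-label sequences and
/// derives transition statistics between clusters.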
pub struct MotifDetectionService {
    config: MotifConfig,
}

impl MotifDetectionService {
    /// Creates a motif detection service with the given configuration.
    #[must_use]
    pub fn new(config: MotifConfig) -> Self {
        Self { config }
    }

    /// Creates a motif detection service with the default configuration.
    #[must_use]
    pub fn default_service() -> Self {
        Self::new(MotifConfig::default())
    }

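    /// Detects motifs: subsequences of cluster labels that recur across the
    /// input sequences often enough to satisfy the configured minimums.
    ///
    /// The effective minimum length is the larger of `min_length` and the
    /// configured minimum; durations assume a fixed 500 ms per segment.
    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an async runtime such as tokio:
    ///
    /// ```ignore
    /// let service = MotifDetectionService::default_service();
    /// let motifs = service.detect_motifs(&sequences, 2).await?;
    /// for motif in &motifs {
    ///     println!("motif of length {}", motif.length());
    /// }
    /// ```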
    #[instrument(skip(self, sequences), fields(n_sequences = sequences.len()))]
    pub async fn detect_motifs(
        &self,
        sequences: &[Vec<ClusterId>],
        min_length: usize,
    ) -> Result<Vec<Motif>> {
        if sequences.is_empty() {
            return Ok(Vec::new());
        }

        let effective_min_length = min_length.max(self.config.min_length);
        let effective_max_length = self.config.max_length;

        info!(
            n_sequences = sequences.len(),
            min_length = effective_min_length,
            max_length = effective_max_length,
            "Starting motif detection"
        );

        let mut all_motifs: HashMap<Vec<ClusterId>, Vec<(usize, usize)>> = HashMap::new();

        for (seq_idx, sequence) in sequences.iter().enumerate() {
            for length in effective_min_length..=effective_max_length {
                if sequence.len() < length {
                    continue;
                }

                for start in 0..=(sequence.len() - length) {
                    let subsequence: Vec<ClusterId> = sequence[start..start + length].to_vec();
                    all_motifs
                        .entry(subsequence)
                        .or_default()
                        .push((seq_idx, start));
                }
            }
        }

        let motifs: Vec<Motif> = all_motifs
            .into_iter()
            .filter(|(_, occurrences)| occurrences.len() >= self.config.min_occurrences)
            .filter_map(|(sequence, occurrences)| {
                let n_occurrences = occurrences.len();
                let confidence = n_occurrences as f32 / sequences.len() as f32;

                if confidence < self.config.min_confidence {
                    return None;
                }

                // Duration estimate assumes a fixed 500 ms per segment.
                let avg_duration_ms = (sequence.len() * 500) as f64;
                let mut motif = Motif::new(sequence, n_occurrences, avg_duration_ms, confidence);

                for (_seq_idx, start) in occurrences {
                    motif.add_occurrence(MotifOccurrence::new(
                        RecordingId::new(),
                        Vec::new(),
                        (start * 500) as u64,
                        ((start + motif.length()) * 500) as u64,
                        1.0,
                    ));
                }

                Some(motif)
            })
            .collect();

        info!(n_motifs = motifs.len(), "Motif detection completed");

        Ok(motifs)
    }

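    /// Builds a first-order transition matrix over consecutive cluster labels
    /// in the given sequences, then normalizes counts into probabilities.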
    #[instrument(skip(self, sequences))]
    pub async fn compute_transition_matrix(
        &self,
        sequences: &[Vec<ClusterId>],
    ) -> Result<TransitionMatrix> {
        let mut all_clusters: std::collections::HashSet<ClusterId> =
            std::collections::HashSet::new();
        for sequence in sequences {
            for cluster in sequence {
                all_clusters.insert(*cluster);
            }
        }

        let cluster_ids: Vec<ClusterId> = all_clusters.into_iter().collect();
        let mut matrix = TransitionMatrix::new(cluster_ids);

        for sequence in sequences {
            for window in sequence.windows(2) {
                matrix.record_transition(&window[0], &window[1]);
            }
        }

        matrix.compute_probabilities();

        debug!(
            n_clusters = matrix.size(),
            n_transitions = matrix.non_zero_transitions().len(),
            "Computed transition matrix"
        );

        Ok(matrix)
    }

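    /// Finds every exact occurrence of a motif's cluster sequence within the
    /// given per-recording sequences. Offsets assume 500 ms per segment.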
    #[instrument(skip(self, motif, sequences))]
    pub async fn find_motif_occurrences(
        &self,
        motif: &Motif,
        sequences: &[(RecordingId, Vec<ClusterId>)],
    ) -> Result<Vec<MotifOccurrence>> {
        let pattern = &motif.sequence;
        let mut occurrences = Vec::new();

        for (recording_id, sequence) in sequences {
            if sequence.len() < pattern.len() {
                continue;
            }

            for start in 0..=(sequence.len() - pattern.len()) {
                let subsequence = &sequence[start..start + pattern.len()];
                if subsequence == pattern.as_slice() {
                    occurrences.push(MotifOccurrence::new(
                        *recording_id,
                        Vec::new(),
                        (start * 500) as u64,
                        ((start + pattern.len()) * 500) as u64,
                        1.0,
                    ));
                }
            }
        }

        Ok(occurrences)
    }
}

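/// Analyzes cluster-label sequences with Markov-chain statistics such as
/// transition probabilities, entropy, and stereotypy.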
pub struct SequenceAnalysisService {
    analyzer: MarkovAnalyzer,
}

impl SequenceAnalysisService {
    /// Creates a sequence analysis service with a fresh Markov analyzer.
    #[must_use]
    pub fn new() -> Self {
        Self {
            analyzer: MarkovAnalyzer::new(),
        }
    }

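    /// Maps segments to their cluster assignments and computes transition
    /// probabilities, entropy, and a stereotypy score for the recording.
    ///
    /// Stereotypy is `1 - H / H_max`, where `H` is the transition entropy and
    /// `H_max = ln(n_unique_clusters)`, so 1.0 indicates a perfectly
    /// repetitive sequence and 0.0 a maximally unpredictable one.
    ///
    /// # Errors
    ///
    /// Returns [`AnalysisError::InsufficientData`] if `segment_ids` is empty
    /// or fewer than two segments have cluster assignments.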
    #[instrument(skip(self, segment_ids, cluster_assignments))]
    pub async fn analyze_sequence(
        &self,
        segment_ids: &[SegmentId],
        cluster_assignments: &HashMap<SegmentId, ClusterId>,
        recording_id: RecordingId,
    ) -> Result<SequenceAnalysis> {
        if segment_ids.is_empty() {
            return Err(AnalysisError::InsufficientData(
                "Empty segment sequence".to_string(),
            ));
        }

        let cluster_sequence: Vec<ClusterId> = segment_ids
            .iter()
            .filter_map(|seg_id| cluster_assignments.get(seg_id).copied())
            .collect();

        if cluster_sequence.len() < 2 {
            return Err(AnalysisError::InsufficientData(
                "Need at least 2 segments for sequence analysis".to_string(),
            ));
        }

        let mut transitions: HashMap<(ClusterId, ClusterId), u32> = HashMap::new();
        for window in cluster_sequence.windows(2) {
            *transitions.entry((window[0], window[1])).or_insert(0) += 1;
        }

        let total_transitions = transitions.values().sum::<u32>() as f32;
        let transition_probs: Vec<(ClusterId, ClusterId, f32)> = transitions
            .into_iter()
            .map(|((from, to), count)| (from, to, count as f32 / total_transitions))
            .collect();

        let entropy = self.compute_entropy(&transition_probs);

        let n_unique_clusters = cluster_sequence
            .iter()
            .collect::<std::collections::HashSet<_>>()
            .len();
        let max_entropy = (n_unique_clusters as f32).ln();
        let normalized_entropy = if max_entropy > 0.0 {
            entropy / max_entropy
        } else {
            0.0
        };
        let stereotypy_score = 1.0 - normalized_entropy;

        let mut analysis = SequenceAnalysis::new(
            recording_id,
            transition_probs,
            entropy,
            stereotypy_score,
        );
        analysis.set_sequence(cluster_sequence, segment_ids.to_vec());

        info!(
            recording_id = %recording_id,
            entropy = entropy,
            stereotypy = stereotypy_score,
            "Sequence analysis completed"
        );

        Ok(analysis)
    }

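    /// Computes the Shannon entropy of a set of transition probabilities,
    /// delegating to the underlying Markov analyzer.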
    #[must_use]
    pub fn compute_entropy(&self, transitions: &[(ClusterId, ClusterId, f32)]) -> f32 {
        self.analyzer.compute_entropy(transitions)
    }

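    /// Computes summary metrics for a cluster sequence: entropy, stereotypy,
    /// repetition rate, and transition counts. Sequences shorter than two
    /// elements yield default metrics.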
    #[instrument(skip(self, cluster_sequence))]
    pub async fn compute_metrics(
        &self,
        cluster_sequence: &[ClusterId],
    ) -> Result<SequenceMetrics> {
        if cluster_sequence.len() < 2 {
            return Ok(SequenceMetrics::default());
        }

        let mut transitions: HashMap<(ClusterId, ClusterId), u32> = HashMap::new();
        let mut self_transitions = 0u32;

        for window in cluster_sequence.windows(2) {
            *transitions.entry((window[0], window[1])).or_insert(0) += 1;
            if window[0] == window[1] {
                self_transitions += 1;
            }
        }

        let total_transitions = (cluster_sequence.len() - 1) as u32;
        let unique_clusters: std::collections::HashSet<_> = cluster_sequence.iter().collect();

        let transition_probs: Vec<(ClusterId, ClusterId, f32)> = transitions
            .iter()
            .map(|(&(from, to), &count)| (from, to, count as f32 / total_transitions as f32))
            .collect();

        let entropy = self.compute_entropy(&transition_probs);
        let max_entropy = (unique_clusters.len() as f32).ln().max(1.0);
        let normalized_entropy = entropy / max_entropy;

        let dominant_transition = transition_probs
            .iter()
            .max_by(|a, b| a.2.partial_cmp(&b.2).unwrap_or(std::cmp::Ordering::Equal))
            .map(|&(from, to, prob)| (from, to, prob));

        Ok(SequenceMetrics {
            entropy,
            normalized_entropy,
            stereotypy: 1.0 - normalized_entropy,
            unique_clusters: unique_clusters.len(),
            unique_transitions: transitions.len(),
            total_transitions: total_transitions as usize,
            dominant_transition,
            repetition_rate: self_transitions as f32 / total_transitions as f32,
        })
    }
}

impl Default for SequenceAnalysisService {
    fn default() -> Self {
        Self::new()
    }
}

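/// Flags embeddings that sit far from every cluster centroid, scoring them
/// by distance and classifying them by anomaly type.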
pub struct AnomalyDetectionService {
    /// Minimum anomaly score for an embedding to be reported.
    threshold: f32,

    /// Number of neighbors used for local outlier factor computation.
    k_neighbors: usize,
}

impl AnomalyDetectionService {
    /// Creates an anomaly detection service with the given score threshold
    /// and neighborhood size.
    #[must_use]
    pub fn new(threshold: f32, k_neighbors: usize) -> Self {
        Self {
            threshold,
            k_neighbors,
        }
    }

    /// Creates a service with a 0.5 threshold and 20 neighbors.
    #[must_use]
    pub fn default_service() -> Self {
        Self::new(0.5, 20)
    }

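    /// Scores every embedding against its nearest cluster centroid and
    /// returns those whose anomaly score exceeds the configured threshold.
    ///
    /// # Example
    ///
    /// A minimal sketch, assuming an async runtime such as tokio:
    ///
    /// ```ignore
    /// let service = AnomalyDetectionService::new(0.6, 20);
    /// let anomalies = service.detect_anomalies(&embeddings, &clusters).await?;
    /// println!("{} anomalies above threshold", anomalies.len());
    /// ```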
    #[instrument(skip(self, embeddings, clusters))]
    pub async fn detect_anomalies(
        &self,
        embeddings: &[EmbeddingWithId],
        clusters: &[Cluster],
    ) -> Result<Vec<Anomaly>> {
        if clusters.is_empty() {
            return Ok(Vec::new());
        }

        let mut anomalies = Vec::new();

        let assigned: std::collections::HashSet<EmbeddingId> = clusters
            .iter()
            .flat_map(|c| c.member_ids.iter())
            .copied()
            .collect();

        for (embedding_id, vector) in embeddings {
            let (nearest_cluster, distance) = clusters
                .iter()
                .map(|c| {
                    let dist = self.cosine_distance(vector, &c.centroid);
                    (c.id, dist)
                })
                .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
                .unwrap();

            // Cluster members are scored by how far they drift from their
            // nearest centroid; unassigned embeddings get a 0.5 baseline.
            let in_cluster = assigned.contains(embedding_id);
            let anomaly_score = if in_cluster {
                (distance * 2.0).min(1.0)
            } else {
                (0.5 + distance).min(1.0)
            };

            if anomaly_score > self.threshold {
                let mut anomaly = Anomaly::new(
                    *embedding_id,
                    anomaly_score,
                    nearest_cluster,
                    distance,
                );

                if !in_cluster {
                    anomaly.set_type(AnomalyType::Novel);
                } else if distance > 0.5 {
                    anomaly.set_type(AnomalyType::Outlier);
                } else {
                    anomaly.set_type(AnomalyType::Rare);
                }

                anomalies.push(anomaly);
            }
        }

        info!(
            n_anomalies = anomalies.len(),
            threshold = self.threshold,
            "Anomaly detection completed"
        );

        Ok(anomalies)
    }

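    /// Computes the local outlier factor (LOF) for each embedding using
    /// `k_neighbors` nearest neighbors under cosine distance.
    ///
    /// Returns an empty map (with a warning) when there are not enough
    /// embeddings to form k-neighborhoods. The pairwise distance pass is
    /// O(n^2) in the number of embeddings.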
    #[instrument(skip(self, embeddings))]
    pub async fn compute_lof(
        &self,
        embeddings: &[EmbeddingWithId],
    ) -> Result<HashMap<EmbeddingId, f32>> {
        if embeddings.len() <= self.k_neighbors {
            warn!(
                n_embeddings = embeddings.len(),
                k = self.k_neighbors,
                "Not enough embeddings for LOF computation"
            );
            return Ok(HashMap::new());
        }

        let n = embeddings.len();
        let k = self.k_neighbors.min(n - 1);

        // Pairwise distances, sorted ascending per point.
        let mut distances: Vec<Vec<(usize, f32)>> = Vec::with_capacity(n);
        for i in 0..n {
            let mut row: Vec<(usize, f32)> = Vec::with_capacity(n - 1);
            for j in 0..n {
                if i != j {
                    let dist = self.cosine_distance(&embeddings[i].1, &embeddings[j].1);
                    row.push((j, dist));
                }
            }
            row.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
            distances.push(row);
        }

        // Distance to each point's k-th nearest neighbor.
        let k_distances: Vec<f32> = distances
            .iter()
            .map(|d| d.get(k - 1).map_or(f32::MAX, |x| x.1))
            .collect();

        // Local reachability density: inverse of the mean reachability distance.
        let mut lrd: Vec<f32> = vec![0.0; n];
        for i in 0..n {
            let mut reach_dist_sum = 0.0;
            for &(j, dist) in distances[i].iter().take(k) {
                reach_dist_sum += k_distances[j].max(dist);
            }
            lrd[i] = if reach_dist_sum > 0.0 {
                k as f32 / reach_dist_sum
            } else {
                f32::MAX
            };
        }

        // LOF: mean ratio of each neighbor's density to the point's own density.
        let mut lof_scores: HashMap<EmbeddingId, f32> = HashMap::new();
        for i in 0..n {
            let mut lof_sum = 0.0;
            for &(j, _) in distances[i].iter().take(k) {
                if lrd[i] > 0.0 {
                    lof_sum += lrd[j] / lrd[i];
                }
            }
            let lof = lof_sum / k as f32;
            lof_scores.insert(embeddings[i].0, lof);
        }

        Ok(lof_scores)
    }

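    /// Classifies an anomaly by its distance to the centroid, the size of
    /// its nearest cluster, and its local outlier factor, if available.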
    #[must_use]
    pub fn classify_anomaly(
        &self,
        anomaly: &Anomaly,
        cluster_member_count: usize,
    ) -> AnomalyType {
        if anomaly.distance_to_centroid > 0.8 {
            AnomalyType::Artifact
        } else if cluster_member_count < 3 {
            AnomalyType::Rare
        } else if anomaly.local_outlier_factor.map_or(false, |lof| lof > 2.0) {
            AnomalyType::Outlier
        } else {
            AnomalyType::Novel
        }
    }

    fn cosine_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm_a == 0.0 || norm_b == 0.0 {
            1.0
        } else {
            1.0 - (dot / (norm_a * norm_b))
        }
    }
}

impl Default for AnomalyDetectionService {
    fn default() -> Self {
        Self::default_service()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_embeddings(n: usize, dim: usize) -> Vec<EmbeddingWithId> {
        (0..n)
            .map(|i| {
                let vec: Vec<f32> = (0..dim)
                    .map(|j| ((i * dim + j) as f32 * 0.01).sin())
                    .collect();
                (EmbeddingId::new(), vec)
            })
            .collect()
    }

    #[tokio::test]
    async fn test_clustering_service_insufficient_data() {
        let service = ClusteringService::default_service();
        let result = service.run_hdbscan(&[]).await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_motif_detection_empty_sequences() {
        let service = MotifDetectionService::default_service();
        let motifs = service.detect_motifs(&[], 2).await.unwrap();
        assert!(motifs.is_empty());
    }

    #[tokio::test]
    async fn test_motif_detection_finds_patterns() {
        let service = MotifDetectionService::new(MotifConfig {
            min_length: 2,
            max_length: 5,
            min_occurrences: 2,
            min_confidence: 0.0,
            allow_overlap: false,
            max_gap: 0,
        });

        let c1 = ClusterId::new();
        let c2 = ClusterId::new();
        let c3 = ClusterId::new();

        let sequences = vec![
            vec![c1, c2, c3, c1, c2, c3],
            vec![c1, c2, c3, c2, c1],
            vec![c2, c1, c2, c3, c1, c2],
        ];

        let motifs = service.detect_motifs(&sequences, 2).await.unwrap();
        assert!(!motifs.is_empty());

        let has_c1_c2 = motifs.iter().any(|m| m.sequence == vec![c1, c2]);
        assert!(has_c1_c2);
    }

    #[tokio::test]
    async fn test_sequence_analysis_computes_entropy() {
        let service = SequenceAnalysisService::new();

        let c1 = ClusterId::new();
        let c2 = ClusterId::new();

        let metrics = service
            .compute_metrics(&[c1, c2, c1, c2, c1, c2])
            .await
            .unwrap();

        assert!(metrics.entropy > 0.0);
        assert!(metrics.stereotypy >= 0.0 && metrics.stereotypy <= 1.0);
    }

    #[tokio::test]
    async fn test_anomaly_detection_empty_clusters() {
        let service = AnomalyDetectionService::default_service();
        let embeddings = create_test_embeddings(10, 16);
        let anomalies = service.detect_anomalies(&embeddings, &[]).await.unwrap();
        assert!(anomalies.is_empty());
    }

    #[test]
    fn test_cosine_distance() {
        let service = AnomalyDetectionService::default_service();

        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        assert!((service.cosine_distance(&a, &b) - 0.0).abs() < 0.001);

        let c = vec![0.0, 1.0, 0.0];
        assert!((service.cosine_distance(&a, &c) - 1.0).abs() < 0.001);
    }
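
    // A sketch of an additional guard-rail test: compute_lof is documented to
    // return an empty map when there are fewer embeddings than k_neighbors
    // (the default service uses k = 20).
    #[tokio::test]
    async fn test_lof_insufficient_embeddings_returns_empty() {
        let service = AnomalyDetectionService::default_service();
        let embeddings = create_test_embeddings(5, 8);
        let scores = service.compute_lof(&embeddings).await.unwrap();
        assert!(scores.is_empty());
    }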
}