// ipfrs_semantic/vector_quality.rs

//! Vector quality analysis and validation utilities
//!
//! This module provides tools for analyzing and validating embedding vectors,
//! detecting anomalies, and ensuring data quality in semantic search systems.

/// Statistics about a vector or collection of vectors
///
/// The `Default` value has every field set to zero, which is the value used
/// throughout this module to describe "no data".
#[derive(Debug, Clone, PartialEq, Default)]
pub struct VectorStats {
    /// Mean of all elements
    pub mean: f32,
    /// Standard deviation
    pub std_dev: f32,
    /// Minimum value
    pub min: f32,
    /// Maximum value
    pub max: f32,
    /// L2 norm (magnitude)
    pub l2_norm: f32,
    /// Number of zero (or near-zero) elements
    pub zero_count: usize,
    /// Number of NaN or infinite values
    pub invalid_count: usize,
    /// Dimension of the vector
    pub dimension: usize,
}

27/// Quality metrics for a vector
28#[derive(Debug, Clone)]
29pub struct VectorQuality {
30    /// Overall quality score (0.0 - 1.0, higher is better)
31    pub quality_score: f32,
32    /// Whether the vector is valid (no NaN/Inf)
33    pub is_valid: bool,
34    /// Whether the vector is normalized
35    pub is_normalized: bool,
36    /// Sparsity ratio (proportion of near-zero elements)
37    pub sparsity: f32,
38    /// Whether the vector appears to be degenerate (all same values, etc.)
39    pub is_degenerate: bool,
40    /// Detailed statistics
41    pub stats: VectorStats,
42}
43
44/// Anomaly detection result
45#[derive(Debug, Clone)]
46pub struct AnomalyReport {
47    /// Whether an anomaly was detected
48    pub is_anomaly: bool,
49    /// Confidence score (0.0 - 1.0, higher means more confident it's an anomaly)
50    pub confidence: f32,
51    /// Type of anomaly detected
52    pub anomaly_type: AnomalyType,
53    /// Human-readable description
54    pub description: String,
55}
56
/// Types of anomalies that can be detected
///
/// The type is `Hash` so that variants can be used as map or set keys, e.g.
/// to tally anomalies by kind.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AnomalyType {
    /// Vector contains invalid values (NaN, Inf)
    InvalidValues,
    /// Vector is degenerate (all zeros, all same value, etc.)
    Degenerate,
    /// Vector has unusual magnitude
    UnusualMagnitude,
    /// Vector is too sparse
    TooSparse,
    /// Vector has unusual distribution
    UnusualDistribution,
    /// No anomaly detected
    None,
}

74/// Compute statistics for a vector
75pub fn compute_stats(vector: &[f32]) -> VectorStats {
76    let n = vector.len();
77    if n == 0 {
78        return VectorStats {
79            mean: 0.0,
80            std_dev: 0.0,
81            min: 0.0,
82            max: 0.0,
83            l2_norm: 0.0,
84            zero_count: 0,
85            invalid_count: 0,
86            dimension: 0,
87        };
88    }
89
90    let mut sum = 0.0;
91    let mut sum_sq = 0.0;
92    let mut min = f32::INFINITY;
93    let mut max = f32::NEG_INFINITY;
94    let mut zero_count = 0;
95    let mut invalid_count = 0;
96
97    for &val in vector {
98        if !val.is_finite() {
99            invalid_count += 1;
100            continue;
101        }
102
103        sum += val;
104        sum_sq += val * val;
105        min = min.min(val);
106        max = max.max(val);
107
108        if val.abs() < 1e-8 {
109            zero_count += 1;
110        }
111    }
112
113    let mean = sum / n as f32;
114    let variance = (sum_sq / n as f32) - (mean * mean);
115    let std_dev = variance.sqrt();
116    let l2_norm = sum_sq.sqrt();
117
118    VectorStats {
119        mean,
120        std_dev,
121        min,
122        max,
123        l2_norm,
124        zero_count,
125        invalid_count,
126        dimension: n,
127    }
128}
129
130/// Analyze vector quality
131pub fn analyze_quality(vector: &[f32]) -> VectorQuality {
132    let stats = compute_stats(vector);
133
134    // Check validity
135    let is_valid = stats.invalid_count == 0;
136
137    // Check if normalized (L2 norm ≈ 1.0)
138    let is_normalized = (stats.l2_norm - 1.0).abs() < 0.01;
139
140    // Compute sparsity
141    let sparsity = stats.zero_count as f32 / stats.dimension as f32;
142
143    // Check if degenerate
144    let is_degenerate = stats.std_dev < 1e-6 || stats.invalid_count > 0;
145
146    // Compute quality score
147    let mut quality_score: f32 = 1.0;
148
149    // Penalize invalid values
150    if !is_valid {
151        quality_score = 0.0;
152    } else {
153        // Penalize degenerate vectors
154        if is_degenerate {
155            quality_score *= 0.3;
156        }
157
158        // Penalize high sparsity
159        if sparsity > 0.9 {
160            quality_score *= 0.5;
161        } else if sparsity > 0.7 {
162            quality_score *= 0.8;
163        }
164
165        // Slight bonus for normalized vectors
166        if is_normalized {
167            quality_score *= 1.05;
168        }
169
170        // Cap at 1.0
171        quality_score = quality_score.min(1.0);
172    }
173
174    VectorQuality {
175        quality_score,
176        is_valid,
177        is_normalized,
178        sparsity,
179        is_degenerate,
180        stats,
181    }
182}
183
184/// Detect anomalies in a vector compared to a baseline distribution
185///
186/// This function compares a vector's statistics against expected values
187/// to identify potential anomalies.
188#[allow(clippy::too_many_arguments)]
189pub fn detect_anomaly(
190    vector: &[f32],
191    expected_mean: f32,
192    expected_std_dev: f32,
193    expected_l2_norm: f32,
194    mean_tolerance: f32,
195    std_dev_tolerance: f32,
196    norm_tolerance: f32,
197) -> AnomalyReport {
198    let quality = analyze_quality(vector);
199
200    // Check for invalid values
201    if !quality.is_valid {
202        return AnomalyReport {
203            is_anomaly: true,
204            confidence: 1.0,
205            anomaly_type: AnomalyType::InvalidValues,
206            description: format!(
207                "Vector contains {} invalid values (NaN or Inf)",
208                quality.stats.invalid_count
209            ),
210        };
211    }
212
213    // Check for degenerate vectors
214    if quality.is_degenerate {
215        return AnomalyReport {
216            is_anomaly: true,
217            confidence: 0.95,
218            anomaly_type: AnomalyType::Degenerate,
219            description: format!("Vector is degenerate: std_dev={:.6}", quality.stats.std_dev),
220        };
221    }
222
223    // Check sparsity
224    if quality.sparsity > 0.95 {
225        return AnomalyReport {
226            is_anomaly: true,
227            confidence: 0.9,
228            anomaly_type: AnomalyType::TooSparse,
229            description: format!(
230                "Vector is too sparse: {:.1}% zeros",
231                quality.sparsity * 100.0
232            ),
233        };
234    }
235
236    // Check magnitude
237    let norm_diff = (quality.stats.l2_norm - expected_l2_norm).abs();
238    if norm_diff > norm_tolerance {
239        let confidence = (norm_diff / expected_l2_norm).min(1.0);
240        return AnomalyReport {
241            is_anomaly: true,
242            confidence,
243            anomaly_type: AnomalyType::UnusualMagnitude,
244            description: format!(
245                "Unusual magnitude: {:.4} (expected {:.4} ± {:.4})",
246                quality.stats.l2_norm, expected_l2_norm, norm_tolerance
247            ),
248        };
249    }
250
251    // Check mean
252    let mean_diff = (quality.stats.mean - expected_mean).abs();
253    if mean_diff > mean_tolerance {
254        let confidence = (mean_diff / mean_tolerance).min(1.0) * 0.7;
255        return AnomalyReport {
256            is_anomaly: true,
257            confidence,
258            anomaly_type: AnomalyType::UnusualDistribution,
259            description: format!(
260                "Unusual mean: {:.4} (expected {:.4} ± {:.4})",
261                quality.stats.mean, expected_mean, mean_tolerance
262            ),
263        };
264    }
265
266    // Check std dev
267    let std_diff = (quality.stats.std_dev - expected_std_dev).abs();
268    if std_diff > std_dev_tolerance {
269        let confidence = (std_diff / std_dev_tolerance).min(1.0) * 0.6;
270        return AnomalyReport {
271            is_anomaly: true,
272            confidence,
273            anomaly_type: AnomalyType::UnusualDistribution,
274            description: format!(
275                "Unusual std dev: {:.4} (expected {:.4} ± {:.4})",
276                quality.stats.std_dev, expected_std_dev, std_dev_tolerance
277            ),
278        };
279    }
280
281    // No anomaly detected
282    AnomalyReport {
283        is_anomaly: false,
284        confidence: 0.0,
285        anomaly_type: AnomalyType::None,
286        description: "No anomaly detected".to_string(),
287    }
288}
289
290/// Batch statistics for a collection of vectors
291#[derive(Debug, Clone)]
292pub struct BatchStats {
293    /// Number of vectors
294    pub count: usize,
295    /// Average quality score
296    pub avg_quality: f32,
297    /// Number of valid vectors
298    pub valid_count: usize,
299    /// Number of normalized vectors
300    pub normalized_count: usize,
301    /// Average sparsity
302    pub avg_sparsity: f32,
303    /// Statistics across all dimensions
304    pub overall_stats: VectorStats,
305}
306
307/// Compute batch statistics for multiple vectors
308pub fn compute_batch_stats(vectors: &[Vec<f32>]) -> BatchStats {
309    if vectors.is_empty() {
310        return BatchStats {
311            count: 0,
312            avg_quality: 0.0,
313            valid_count: 0,
314            normalized_count: 0,
315            avg_sparsity: 0.0,
316            overall_stats: VectorStats {
317                mean: 0.0,
318                std_dev: 0.0,
319                min: 0.0,
320                max: 0.0,
321                l2_norm: 0.0,
322                zero_count: 0,
323                invalid_count: 0,
324                dimension: 0,
325            },
326        };
327    }
328
329    let mut total_quality = 0.0;
330    let mut valid_count = 0;
331    let mut normalized_count = 0;
332    let mut total_sparsity = 0.0;
333
334    // Collect per-dimension statistics
335    let dim = vectors[0].len();
336    let mut dim_sums = vec![0.0; dim];
337    let mut dim_counts = vec![0; dim];
338
339    for vector in vectors {
340        let quality = analyze_quality(vector);
341        total_quality += quality.quality_score;
342        if quality.is_valid {
343            valid_count += 1;
344        }
345        if quality.is_normalized {
346            normalized_count += 1;
347        }
348        total_sparsity += quality.sparsity;
349
350        // Accumulate dimension statistics
351        for (i, &val) in vector.iter().enumerate() {
352            if i < dim && val.is_finite() {
353                dim_sums[i] += val;
354                dim_counts[i] += 1;
355            }
356        }
357    }
358
359    // Compute overall statistics across all dimensions
360    let all_values: Vec<f32> = vectors.iter().flatten().copied().collect();
361    let overall_stats = compute_stats(&all_values);
362
363    BatchStats {
364        count: vectors.len(),
365        avg_quality: total_quality / vectors.len() as f32,
366        valid_count,
367        normalized_count,
368        avg_sparsity: total_sparsity / vectors.len() as f32,
369        overall_stats,
370    }
371}
372
373/// Find outlier vectors in a batch based on their distance from the mean
374pub fn find_outliers(vectors: &[Vec<f32>], threshold: f32) -> Vec<usize> {
375    if vectors.is_empty() {
376        return Vec::new();
377    }
378
379    let dim = vectors[0].len();
380
381    // Compute mean vector
382    let mut mean_vec = vec![0.0; dim];
383    for vector in vectors {
384        for (i, &val) in vector.iter().enumerate() {
385            if i < dim && val.is_finite() {
386                mean_vec[i] += val;
387            }
388        }
389    }
390    for val in &mut mean_vec {
391        *val /= vectors.len() as f32;
392    }
393
394    // Compute distances from mean
395    let distances: Vec<(usize, f32)> = vectors
396        .iter()
397        .enumerate()
398        .map(|(idx, vector)| {
399            let dist = compute_l2_distance(vector, &mean_vec);
400            (idx, dist)
401        })
402        .collect();
403
404    // Compute mean and std dev of distances
405    let mean_dist: f32 = distances.iter().map(|(_, d)| d).sum::<f32>() / distances.len() as f32;
406    let variance: f32 = distances
407        .iter()
408        .map(|(_, d)| (d - mean_dist).powi(2))
409        .sum::<f32>()
410        / distances.len() as f32;
411    let std_dist = variance.sqrt();
412
413    // Find outliers (distance > mean + threshold * std)
414    let outlier_threshold = mean_dist + threshold * std_dist;
415    distances
416        .into_iter()
417        .filter(|(_, dist)| *dist > outlier_threshold)
418        .map(|(idx, _)| idx)
419        .collect()
420}
421
/// Compute L2 distance between two vectors
///
/// Components are paired positionally. If the slices differ in length, the
/// surplus components of the longer one are ignored; two empty slices have a
/// distance of `0.0`.
fn compute_l2_distance(a: &[f32], b: &[f32]) -> f32 {
    let squared_sum: f32 = a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum();
    squared_sum.sqrt()
}

/// Compute cosine similarity between two vectors
///
/// Returns a value in `-1.0..=1.0`. The result is `0.0` when the slices differ
/// in length or when either vector has zero magnitude (including two empty
/// slices). NaN components propagate to a NaN result.
///
/// The dot product and norms are accumulated in `f64`, so very large or very
/// small `f32` components do not overflow or underflow when squared, and the
/// quotient is clamped so rounding cannot push it outside the valid range.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Single pass: dot product and both squared norms.
    let (dot, norm_sq_a, norm_sq_b) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, norm_sq_a, norm_sq_b), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, norm_sq_a + x * x, norm_sq_b + y * y)
        },
    );

    let denominator = norm_sq_a.sqrt() * norm_sq_b.sqrt();
    if denominator == 0.0 {
        return 0.0;
    }

    (dot / denominator).clamp(-1.0, 1.0) as f32
}

448/// Diversity score for a set of vectors
449///
450/// Measures how diverse a collection of vectors is (0.0 = all identical, 1.0 = maximally diverse)
451pub fn compute_diversity(vectors: &[Vec<f32>]) -> f32 {
452    if vectors.len() < 2 {
453        return 0.0;
454    }
455
456    let mut total_distance = 0.0;
457    let mut count = 0;
458
459    for i in 0..vectors.len() {
460        for j in (i + 1)..vectors.len() {
461            total_distance += compute_l2_distance(&vectors[i], &vectors[j]);
462            count += 1;
463        }
464    }
465
466    if count == 0 {
467        return 0.0;
468    }
469
470    // Normalize by the maximum possible distance (assuming unit vectors)
471    let avg_distance = total_distance / count as f32;
472    let max_distance = 2.0_f32.sqrt(); // Max L2 distance between unit vectors
473
474    (avg_distance / max_distance).min(1.0)
475}
476
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compute_stats() {
        let vector = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = compute_stats(&vector);

        assert_eq!(stats.dimension, 5);
        assert_eq!(stats.mean, 3.0);
        assert_eq!(stats.min, 1.0);
        assert_eq!(stats.max, 5.0);
        assert_eq!(stats.invalid_count, 0);
    }

    #[test]
    fn test_compute_stats_empty() {
        let stats = compute_stats(&[]);

        assert_eq!(stats.dimension, 0);
        assert_eq!(stats.mean, 0.0);
        assert_eq!(stats.l2_norm, 0.0);
        assert_eq!(stats.invalid_count, 0);
    }

    #[test]
    fn test_analyze_quality_valid() {
        let vector = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let quality = analyze_quality(&vector);

        assert!(quality.is_valid);
        assert!(!quality.is_degenerate);
        assert!(quality.quality_score > 0.5);
    }

    #[test]
    fn test_analyze_quality_invalid() {
        let vector = vec![f32::NAN, 0.2, 0.3, 0.4, 0.5];
        let quality = analyze_quality(&vector);

        assert!(!quality.is_valid);
        assert_eq!(quality.quality_score, 0.0);
    }

    #[test]
    fn test_analyze_quality_degenerate() {
        let vector = vec![1.0, 1.0, 1.0, 1.0, 1.0];
        let quality = analyze_quality(&vector);

        assert!(quality.is_degenerate);
        assert!(quality.quality_score < 0.5);
    }

    #[test]
    fn test_detect_anomaly_invalid() {
        let vector = vec![f32::NAN, 0.2, 0.3];
        let report = detect_anomaly(&vector, 0.0, 1.0, 1.0, 0.1, 0.1, 0.1);

        assert!(report.is_anomaly);
        assert_eq!(report.anomaly_type, AnomalyType::InvalidValues);
    }

    #[test]
    fn test_detect_anomaly_normal() {
        let vector = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let stats = compute_stats(&vector);
        let report = detect_anomaly(
            &vector,
            stats.mean,
            stats.std_dev,
            stats.l2_norm,
            0.5,
            0.5,
            0.5,
        );

        assert!(!report.is_anomaly);
        assert_eq!(report.anomaly_type, AnomalyType::None);
    }

    #[test]
    fn test_compute_batch_stats() {
        let vectors = vec![
            vec![0.1, 0.2, 0.3],
            vec![0.4, 0.5, 0.6],
            vec![0.7, 0.8, 0.9],
        ];

        let stats = compute_batch_stats(&vectors);

        assert_eq!(stats.count, 3);
        assert!(stats.avg_quality > 0.0);
        assert_eq!(stats.valid_count, 3);
    }

    #[test]
    fn test_compute_batch_stats_empty() {
        let stats = compute_batch_stats(&[]);

        assert_eq!(stats.count, 0);
        assert_eq!(stats.valid_count, 0);
        assert_eq!(stats.avg_quality, 0.0);
        assert_eq!(stats.overall_stats.dimension, 0);
    }

    #[test]
    fn test_find_outliers() {
        let vectors = vec![
            vec![0.0, 0.0, 0.0],
            vec![0.1, 0.1, 0.1],
            vec![0.2, 0.2, 0.2],
            vec![10.0, 10.0, 10.0], // Obvious outlier
        ];

        let outliers = find_outliers(&vectors, 1.0);

        assert!(
            outliers.contains(&3),
            "Expected vector at index 3 to be detected as outlier"
        );
        assert_eq!(outliers.len(), 1, "Expected exactly one outlier");
    }

    #[test]
    fn test_find_outliers_empty() {
        assert!(find_outliers(&[], 1.0).is_empty());
    }

    #[test]
    fn test_cosine_similarity() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];

        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 1e-6);

        let c = vec![0.0, 1.0, 0.0];
        let sim2 = cosine_similarity(&a, &c);
        assert!(sim2.abs() < 1e-6);
    }

    #[test]
    fn test_cosine_similarity_degenerate_inputs() {
        // Mismatched lengths and zero-magnitude vectors both yield 0.0.
        assert_eq!(cosine_similarity(&[1.0, 2.0], &[1.0]), 0.0);
        assert_eq!(cosine_similarity(&[0.0, 0.0], &[1.0, 2.0]), 0.0);
    }

    #[test]
    fn test_compute_diversity() {
        // All identical vectors
        let identical = vec![vec![1.0, 0.0], vec![1.0, 0.0], vec![1.0, 0.0]];
        assert_eq!(compute_diversity(&identical), 0.0);

        // Diverse vectors
        let diverse = vec![vec![1.0, 0.0], vec![0.0, 1.0], vec![-1.0, 0.0]];
        assert!(compute_diversity(&diverse) > 0.5);
    }

    #[test]
    fn test_compute_diversity_too_few_vectors() {
        assert_eq!(compute_diversity(&[]), 0.0);
        assert_eq!(compute_diversity(&[vec![1.0, 0.0]]), 0.0);
    }
}