/// Summary statistics for a single vector, as produced by [`compute_stats`].
#[derive(Debug, Clone)]
pub struct VectorStats {
    /// Mean of the element values; non-finite elements are left out of the sum.
    pub mean: f32,
    /// Population (not sample) standard deviation of the element values.
    pub std_dev: f32,
    /// Smallest finite element.
    pub min: f32,
    /// Largest finite element.
    pub max: f32,
    /// Euclidean (L2) norm, accumulated over the finite elements.
    pub l2_norm: f32,
    /// Number of finite elements that are close enough to zero to be counted as zero.
    pub zero_count: usize,
    /// Number of non-finite elements (NaN or ±infinity).
    pub invalid_count: usize,
    /// Total number of elements in the vector, invalid ones included.
    pub dimension: usize,
}
26
/// Quality assessment of a single vector, as produced by [`analyze_quality`].
#[derive(Debug, Clone)]
pub struct VectorQuality {
    /// Overall score in `[0.0, 1.0]`; `0.0` when the vector contains invalid values.
    pub quality_score: f32,
    /// `true` when the vector contains no NaN or infinite elements.
    pub is_valid: bool,
    /// `true` when the L2 norm is approximately 1.
    pub is_normalized: bool,
    /// Fraction of elements counted as zero (`zero_count / dimension`).
    pub sparsity: f32,
    /// `true` when the vector has (almost) no spread or contains invalid values.
    pub is_degenerate: bool,
    /// The raw statistics the assessment was derived from.
    pub stats: VectorStats,
}
43
/// Outcome of [`detect_anomaly`] for a single vector.
#[derive(Debug, Clone)]
pub struct AnomalyReport {
    /// Whether any of the anomaly checks fired.
    pub is_anomaly: bool,
    /// Confidence attached to the finding; `0.0` when no anomaly was detected.
    pub confidence: f32,
    /// Which check fired, or [`AnomalyType::None`].
    pub anomaly_type: AnomalyType,
    /// Human-readable explanation of the finding.
    pub description: String,
}
56
/// Category of anomaly reported by [`detect_anomaly`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AnomalyType {
    /// The vector contains NaN or infinite elements.
    InvalidValues,
    /// The vector was classified as degenerate by the quality analysis.
    Degenerate,
    /// The L2 norm deviates from the expected norm by more than the tolerance.
    UnusualMagnitude,
    /// Almost all elements of the vector are zero.
    TooSparse,
    /// The mean or the standard deviation deviates from its expected value by more than the
    /// tolerance.
    UnusualDistribution,
    /// No anomaly was detected.
    None,
}
73
74pub fn compute_stats(vector: &[f32]) -> VectorStats {
76 let n = vector.len();
77 if n == 0 {
78 return VectorStats {
79 mean: 0.0,
80 std_dev: 0.0,
81 min: 0.0,
82 max: 0.0,
83 l2_norm: 0.0,
84 zero_count: 0,
85 invalid_count: 0,
86 dimension: 0,
87 };
88 }
89
90 let mut sum = 0.0;
91 let mut sum_sq = 0.0;
92 let mut min = f32::INFINITY;
93 let mut max = f32::NEG_INFINITY;
94 let mut zero_count = 0;
95 let mut invalid_count = 0;
96
97 for &val in vector {
98 if !val.is_finite() {
99 invalid_count += 1;
100 continue;
101 }
102
103 sum += val;
104 sum_sq += val * val;
105 min = min.min(val);
106 max = max.max(val);
107
108 if val.abs() < 1e-8 {
109 zero_count += 1;
110 }
111 }
112
113 let mean = sum / n as f32;
114 let variance = (sum_sq / n as f32) - (mean * mean);
115 let std_dev = variance.sqrt();
116 let l2_norm = sum_sq.sqrt();
117
118 VectorStats {
119 mean,
120 std_dev,
121 min,
122 max,
123 l2_norm,
124 zero_count,
125 invalid_count,
126 dimension: n,
127 }
128}
129
130pub fn analyze_quality(vector: &[f32]) -> VectorQuality {
132 let stats = compute_stats(vector);
133
134 let is_valid = stats.invalid_count == 0;
136
137 let is_normalized = (stats.l2_norm - 1.0).abs() < 0.01;
139
140 let sparsity = stats.zero_count as f32 / stats.dimension as f32;
142
143 let is_degenerate = stats.std_dev < 1e-6 || stats.invalid_count > 0;
145
146 let mut quality_score: f32 = 1.0;
148
149 if !is_valid {
151 quality_score = 0.0;
152 } else {
153 if is_degenerate {
155 quality_score *= 0.3;
156 }
157
158 if sparsity > 0.9 {
160 quality_score *= 0.5;
161 } else if sparsity > 0.7 {
162 quality_score *= 0.8;
163 }
164
165 if is_normalized {
167 quality_score *= 1.05;
168 }
169
170 quality_score = quality_score.min(1.0);
172 }
173
174 VectorQuality {
175 quality_score,
176 is_valid,
177 is_normalized,
178 sparsity,
179 is_degenerate,
180 stats,
181 }
182}
183
184#[allow(clippy::too_many_arguments)]
189pub fn detect_anomaly(
190 vector: &[f32],
191 expected_mean: f32,
192 expected_std_dev: f32,
193 expected_l2_norm: f32,
194 mean_tolerance: f32,
195 std_dev_tolerance: f32,
196 norm_tolerance: f32,
197) -> AnomalyReport {
198 let quality = analyze_quality(vector);
199
200 if !quality.is_valid {
202 return AnomalyReport {
203 is_anomaly: true,
204 confidence: 1.0,
205 anomaly_type: AnomalyType::InvalidValues,
206 description: format!(
207 "Vector contains {} invalid values (NaN or Inf)",
208 quality.stats.invalid_count
209 ),
210 };
211 }
212
213 if quality.is_degenerate {
215 return AnomalyReport {
216 is_anomaly: true,
217 confidence: 0.95,
218 anomaly_type: AnomalyType::Degenerate,
219 description: format!("Vector is degenerate: std_dev={:.6}", quality.stats.std_dev),
220 };
221 }
222
223 if quality.sparsity > 0.95 {
225 return AnomalyReport {
226 is_anomaly: true,
227 confidence: 0.9,
228 anomaly_type: AnomalyType::TooSparse,
229 description: format!(
230 "Vector is too sparse: {:.1}% zeros",
231 quality.sparsity * 100.0
232 ),
233 };
234 }
235
236 let norm_diff = (quality.stats.l2_norm - expected_l2_norm).abs();
238 if norm_diff > norm_tolerance {
239 let confidence = (norm_diff / expected_l2_norm).min(1.0);
240 return AnomalyReport {
241 is_anomaly: true,
242 confidence,
243 anomaly_type: AnomalyType::UnusualMagnitude,
244 description: format!(
245 "Unusual magnitude: {:.4} (expected {:.4} ± {:.4})",
246 quality.stats.l2_norm, expected_l2_norm, norm_tolerance
247 ),
248 };
249 }
250
251 let mean_diff = (quality.stats.mean - expected_mean).abs();
253 if mean_diff > mean_tolerance {
254 let confidence = (mean_diff / mean_tolerance).min(1.0) * 0.7;
255 return AnomalyReport {
256 is_anomaly: true,
257 confidence,
258 anomaly_type: AnomalyType::UnusualDistribution,
259 description: format!(
260 "Unusual mean: {:.4} (expected {:.4} ± {:.4})",
261 quality.stats.mean, expected_mean, mean_tolerance
262 ),
263 };
264 }
265
266 let std_diff = (quality.stats.std_dev - expected_std_dev).abs();
268 if std_diff > std_dev_tolerance {
269 let confidence = (std_diff / std_dev_tolerance).min(1.0) * 0.6;
270 return AnomalyReport {
271 is_anomaly: true,
272 confidence,
273 anomaly_type: AnomalyType::UnusualDistribution,
274 description: format!(
275 "Unusual std dev: {:.4} (expected {:.4} ± {:.4})",
276 quality.stats.std_dev, expected_std_dev, std_dev_tolerance
277 ),
278 };
279 }
280
281 AnomalyReport {
283 is_anomaly: false,
284 confidence: 0.0,
285 anomaly_type: AnomalyType::None,
286 description: "No anomaly detected".to_string(),
287 }
288}
289
/// Aggregate quality figures for a batch of vectors, as produced by [`compute_batch_stats`].
#[derive(Debug, Clone)]
pub struct BatchStats {
    /// Number of vectors in the batch.
    pub count: usize,
    /// Mean of the per-vector quality scores.
    pub avg_quality: f32,
    /// Number of vectors without NaN or infinite elements.
    pub valid_count: usize,
    /// Number of vectors classified as normalized.
    pub normalized_count: usize,
    /// Mean of the per-vector sparsity values.
    pub avg_sparsity: f32,
    /// Statistics over the elements of all vectors taken together as one flat sequence.
    pub overall_stats: VectorStats,
}
306
307pub fn compute_batch_stats(vectors: &[Vec<f32>]) -> BatchStats {
309 if vectors.is_empty() {
310 return BatchStats {
311 count: 0,
312 avg_quality: 0.0,
313 valid_count: 0,
314 normalized_count: 0,
315 avg_sparsity: 0.0,
316 overall_stats: VectorStats {
317 mean: 0.0,
318 std_dev: 0.0,
319 min: 0.0,
320 max: 0.0,
321 l2_norm: 0.0,
322 zero_count: 0,
323 invalid_count: 0,
324 dimension: 0,
325 },
326 };
327 }
328
329 let mut total_quality = 0.0;
330 let mut valid_count = 0;
331 let mut normalized_count = 0;
332 let mut total_sparsity = 0.0;
333
334 let dim = vectors[0].len();
336 let mut dim_sums = vec![0.0; dim];
337 let mut dim_counts = vec![0; dim];
338
339 for vector in vectors {
340 let quality = analyze_quality(vector);
341 total_quality += quality.quality_score;
342 if quality.is_valid {
343 valid_count += 1;
344 }
345 if quality.is_normalized {
346 normalized_count += 1;
347 }
348 total_sparsity += quality.sparsity;
349
350 for (i, &val) in vector.iter().enumerate() {
352 if i < dim && val.is_finite() {
353 dim_sums[i] += val;
354 dim_counts[i] += 1;
355 }
356 }
357 }
358
359 let all_values: Vec<f32> = vectors.iter().flatten().copied().collect();
361 let overall_stats = compute_stats(&all_values);
362
363 BatchStats {
364 count: vectors.len(),
365 avg_quality: total_quality / vectors.len() as f32,
366 valid_count,
367 normalized_count,
368 avg_sparsity: total_sparsity / vectors.len() as f32,
369 overall_stats,
370 }
371}
372
/// Returns the indices (ascending) of vectors that lie unusually far from the batch centroid.
///
/// The centroid is the per-component mean of the finite values; its dimension is taken from
/// the first vector, and components beyond that length are ignored. A vector is an outlier
/// when its Euclidean distance to the centroid exceeds
/// `mean_distance + threshold * std_dev_of_distances`, where both figures are computed over
/// the vectors with a finite distance.
///
/// Vectors whose distance is not finite (they contain NaN or infinite components) are always
/// reported as outliers and are kept out of the distance statistics, so a single bad vector
/// cannot mask the detection of the others.
///
/// Returns an empty list for an empty batch.
pub fn find_outliers(vectors: &[Vec<f32>], threshold: f32) -> Vec<usize> {
    let dim = match vectors.first() {
        Some(first) => first.len(),
        None => return Vec::new(),
    };

    // Per-component sums and the number of finite contributions to each of them.
    let mut centroid = vec![0.0_f32; dim];
    let mut finite_counts = vec![0_usize; dim];
    for vector in vectors {
        let slots = centroid.iter_mut().zip(finite_counts.iter_mut());
        // `zip` stops at the shorter side, so longer vectors are cut to `dim` components.
        for ((sum, seen), &val) in slots.zip(vector) {
            if val.is_finite() {
                *sum += val;
                *seen += 1;
            }
        }
    }
    // Divide by the number of values actually summed, not by the batch size, so skipped
    // values do not pull the centroid towards zero.
    for (sum, &seen) in centroid.iter_mut().zip(&finite_counts) {
        if seen > 0 {
            *sum /= seen as f32;
        }
    }

    let distances: Vec<f32> = vectors
        .iter()
        .map(|vector| compute_l2_distance(vector, &centroid))
        .collect();

    let finite: Vec<f32> = distances
        .iter()
        .copied()
        .filter(|dist| dist.is_finite())
        .collect();

    let cutoff = if finite.is_empty() {
        // No usable distance at all: only the non-finite rule below can fire.
        f32::INFINITY
    } else {
        let n = finite.len() as f32;
        let mean_dist = finite.iter().sum::<f32>() / n;
        let variance = finite
            .iter()
            .map(|dist| (dist - mean_dist).powi(2))
            .sum::<f32>()
            / n;
        mean_dist + threshold * variance.sqrt()
    };

    distances
        .iter()
        .enumerate()
        .filter(|&(_, &dist)| !dist.is_finite() || dist > cutoff)
        .map(|(idx, _)| idx)
        .collect()
}

/// Euclidean distance between `a` and `b`.
///
/// When the slices differ in length, only the leading components they have in common are
/// compared.
fn compute_l2_distance(a: &[f32], b: &[f32]) -> f32 {
    let squared_total: f32 = a
        .iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y).powi(2))
        .sum();
    squared_total.sqrt()
}
430
/// Cosine similarity of `a` and `b`, always within `[-1.0, 1.0]` for finite input.
///
/// Returns `0.0` when the slices differ in length or when either of them has zero norm
/// (which includes empty slices). The quotient is clamped because floating-point rounding can
/// push it marginally outside the mathematical range, which would break callers that feed
/// the result into e.g. `acos`.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in one pass.
    let mut dot_product = 0.0_f32;
    let mut norm_sq_a = 0.0_f32;
    let mut norm_sq_b = 0.0_f32;
    for (&x, &y) in a.iter().zip(b) {
        dot_product += x * y;
        norm_sq_a += x * x;
        norm_sq_b += y * y;
    }

    let norm_a = norm_sq_a.sqrt();
    let norm_b = norm_sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    (dot_product / (norm_a * norm_b)).clamp(-1.0, 1.0)
}
447
448pub fn compute_diversity(vectors: &[Vec<f32>]) -> f32 {
452 if vectors.len() < 2 {
453 return 0.0;
454 }
455
456 let mut total_distance = 0.0;
457 let mut count = 0;
458
459 for i in 0..vectors.len() {
460 for j in (i + 1)..vectors.len() {
461 total_distance += compute_l2_distance(&vectors[i], &vectors[j]);
462 count += 1;
463 }
464 }
465
466 if count == 0 {
467 return 0.0;
468 }
469
470 let avg_distance = total_distance / count as f32;
472 let max_distance = 2.0_f32.sqrt(); (avg_distance / max_distance).min(1.0)
475}
476
// Unit tests for the vector statistics, quality, anomaly and batch helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // Basic statistics of a small, fully finite vector.
    #[test]
    fn test_compute_stats() {
        let vector = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let stats = compute_stats(&vector);

        assert_eq!(stats.dimension, 5);
        assert_eq!(stats.mean, 3.0);
        assert_eq!(stats.min, 1.0);
        assert_eq!(stats.max, 5.0);
        assert_eq!(stats.invalid_count, 0);
    }

    // A finite vector with real spread is valid, not degenerate, and scores well.
    #[test]
    fn test_analyze_quality_valid() {
        let vector = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let quality = analyze_quality(&vector);

        assert!(quality.is_valid);
        assert!(!quality.is_degenerate);
        assert!(quality.quality_score > 0.5);
    }

    // A single NaN invalidates the vector and forces the score to zero.
    #[test]
    fn test_analyze_quality_invalid() {
        let vector = vec![f32::NAN, 0.2, 0.3, 0.4, 0.5];
        let quality = analyze_quality(&vector);

        assert!(!quality.is_valid);
        assert_eq!(quality.quality_score, 0.0);
    }

    // A constant vector has zero spread and is therefore degenerate.
    #[test]
    fn test_analyze_quality_degenerate() {
        let vector = vec![1.0, 1.0, 1.0, 1.0, 1.0];
        let quality = analyze_quality(&vector);

        assert!(quality.is_degenerate);
        assert!(quality.quality_score < 0.5);
    }

    // Invalid values take precedence over every other anomaly check.
    #[test]
    fn test_detect_anomaly_invalid() {
        let vector = vec![f32::NAN, 0.2, 0.3];
        let report = detect_anomaly(&vector, 0.0, 1.0, 1.0, 0.1, 0.1, 0.1);

        assert!(report.is_anomaly);
        assert_eq!(report.anomaly_type, AnomalyType::InvalidValues);
    }

    // The expected values are the vector's own statistics, so no check should fire.
    #[test]
    fn test_detect_anomaly_normal() {
        let vector = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let stats = compute_stats(&vector);
        let report = detect_anomaly(
            &vector,
            stats.mean,
            stats.std_dev,
            stats.l2_norm,
            0.5,
            0.5,
            0.5,
        );

        assert!(!report.is_anomaly);
        assert_eq!(report.anomaly_type, AnomalyType::None);
    }

    // Counts and averages over a batch of three valid vectors.
    #[test]
    fn test_compute_batch_stats() {
        let vectors = vec![
            vec![0.1, 0.2, 0.3],
            vec![0.4, 0.5, 0.6],
            vec![0.7, 0.8, 0.9],
        ];

        let stats = compute_batch_stats(&vectors);

        assert_eq!(stats.count, 3);
        assert!(stats.avg_quality > 0.0);
        assert_eq!(stats.valid_count, 3);
    }

    // Three clustered vectors plus one far away: only the distant one is an outlier.
    #[test]
    fn test_find_outliers() {
        let vectors = vec![
            vec![0.0, 0.0, 0.0],
            vec![0.1, 0.1, 0.1],
            vec![0.2, 0.2, 0.2],
            // Far from the other three.
            vec![10.0, 10.0, 10.0],
        ];

        let outliers = find_outliers(&vectors, 1.0);

        assert!(
            outliers.contains(&3),
            "Expected vector at index 3 to be detected as outlier"
        );
        assert_eq!(outliers.len(), 1, "Expected exactly one outlier");
    }

    // Identical vectors have similarity 1, orthogonal vectors similarity 0.
    #[test]
    fn test_cosine_similarity() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];

        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 1e-6);

        let c = vec![0.0, 1.0, 0.0];
        let sim2 = cosine_similarity(&a, &c);
        assert!(sim2.abs() < 1e-6);
    }

    // Identical vectors have no diversity; spread-out unit vectors have a high one.
    #[test]
    fn test_compute_diversity() {
        let identical = vec![vec![1.0, 0.0], vec![1.0, 0.0], vec![1.0, 0.0]];
        assert_eq!(compute_diversity(&identical), 0.0);

        let diverse = vec![vec![1.0, 0.0], vec![0.0, 1.0], vec![-1.0, 0.0]];
        assert!(compute_diversity(&diverse) > 0.5);
    }
}