1use crate::{
14 benchmarking::{BenchmarkConfig, BenchmarkDataset, BenchmarkOutputFormat},
15 Vector, VectorIndex,
16};
17use anyhow::{anyhow, Result};
18use serde::{Deserialize, Serialize};
19use std::collections::HashMap;
20use std::time::{Duration, Instant};
21
/// Extended benchmark configuration layering statistical rigor and extra
/// measurement passes on top of the basic [`BenchmarkConfig`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AdvancedBenchmarkConfig {
    /// Settings shared with the basic benchmark runner.
    pub base_config: BenchmarkConfig,
    /// Confidence level for interval estimates (defaults to 0.95).
    pub confidence_level: f64,
    /// Minimum number of benchmark repetitions per measurement.
    pub min_runs: usize,
    /// Maximum accepted coefficient of variation for a stable result.
    pub max_cv: f64,
    /// Record memory usage while benchmarking.
    pub memory_profiling: bool,
    /// Capture the full latency distribution, not just summary stats.
    pub latency_distribution: bool,
    /// Run throughput (QPS) measurements.
    pub throughput_testing: bool,
    /// Measure quality/performance trade-off curves.
    pub quality_degradation: bool,
    /// Run hyperparameter search for each algorithm.
    pub hyperparameter_optimization: bool,
    /// Run cross-algorithm comparative analysis after benchmarking.
    pub comparative_analysis: bool,
    /// Emit results in ann-benchmarks-compatible form.
    pub ann_benchmarks_mode: bool,
    /// Export per-query/system/memory traces with the results.
    pub export_traces: bool,
    /// Parallel execution settings.
    pub parallel_config: ParallelBenchmarkConfig,
}
52
/// Settings controlling parallel benchmark execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParallelBenchmarkConfig {
    /// Number of worker threads (defaults to the logical CPU count).
    pub num_threads: usize,
    /// Attempt NUMA-aware placement.
    pub numa_aware: bool,
    /// Pin worker threads to cores.
    pub thread_affinity: bool,
    /// Additionally measure memory bandwidth.
    pub memory_bandwidth_test: bool,
}
65
/// Orchestrates advanced benchmarking: holds the datasets and algorithms
/// under test, accumulates results, and owns the analysis helpers.
pub struct AdvancedBenchmarkSuite {
    /// Suite-wide configuration.
    config: AdvancedBenchmarkConfig,
    /// Datasets enriched with precomputed statistics.
    datasets: Vec<EnhancedBenchmarkDataset>,
    /// Algorithms (and their index instances) to benchmark.
    algorithms: Vec<BenchmarkAlgorithm>,
    /// Results collected so far.
    results: Vec<AdvancedBenchmarkResult>,
    #[allow(dead_code)]
    statistical_analyzer: StatisticalAnalyzer,
    #[allow(dead_code)]
    performance_profiler: PerformanceProfiler,
    #[allow(dead_code)]
    hyperparameter_tuner: HyperparameterTuner,
}
79
/// A benchmark dataset augmented with precomputed structural statistics that
/// help interpret algorithm behavior on it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnhancedBenchmarkDataset {
    /// The underlying train/query data.
    pub base_dataset: BenchmarkDataset,
    /// Magnitude/distance summary statistics.
    pub statistics: DatasetStatistics,
    /// Structural quality indicators.
    pub quality_metrics: DatasetQualityMetrics,
    /// Global intrinsic-dimensionality estimate.
    pub intrinsic_dimensionality: f32,
    /// Average k-NN graph clustering coefficient.
    pub clustering_coefficient: f32,
    /// Skewness of the k-occurrence distribution (hubness).
    pub hubness_score: f32,
    /// Per-sample local intrinsic-dimensionality estimates.
    pub local_id: Vec<f32>,
}
98
/// Summary statistics of a dataset's vectors.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetStatistics {
    /// Number of vectors.
    pub vector_count: usize,
    /// Declared dimensionality of the vectors.
    pub dimensions: usize,
    /// Mean vector magnitude.
    pub mean_magnitude: f32,
    /// Standard deviation of vector magnitudes.
    pub std_magnitude: f32,
    /// Pairwise-distance statistics over a sample.
    pub distance_stats: DistanceStatistics,
    /// Nearest-neighbor distances for a sample of points.
    pub nn_distribution: Vec<f32>,
    /// Fraction of near-zero components, when computable.
    pub sparsity_ratio: Option<f32>,
}
117
/// Statistics of sampled pairwise distances.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DistanceStatistics {
    pub mean_distance: f32,
    pub std_distance: f32,
    pub min_distance: f32,
    pub max_distance: f32,
    /// (percentile, distance) pairs, e.g. (50.0, median distance).
    pub percentiles: Vec<(f32, f32)>,
}
132
/// Structural quality indicators for a dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DatasetQualityMetrics {
    /// Variance-based effective dimensionality.
    pub effective_dimensionality: f32,
    /// Distance concentration (std/mean of pairwise distances).
    pub concentration_measure: f32,
    /// Fraction of outliers by distance to the centroid.
    pub outlier_ratio: f32,
    /// Silhouette-style clustering score.
    pub cluster_quality: f32,
    /// k-NN symmetry consistency score.
    pub manifold_quality: f32,
}
147
/// An algorithm under benchmark: its index instance plus its parameter set
/// and measurements filled in during the run.
pub struct BenchmarkAlgorithm {
    pub name: String,
    pub description: String,
    /// The index implementation being measured.
    pub index: Box<dyn VectorIndex>,
    pub parameters: AlgorithmParameters,
    /// Set once the index has been built.
    pub build_time: Option<Duration>,
    /// Set once memory usage has been measured.
    pub memory_usage: Option<usize>,
}
157
/// Parameter sets for an algorithm, split by when they apply.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlgorithmParameters {
    /// General parameters.
    pub params: HashMap<String, ParameterValue>,
    /// Parameters applied at query time.
    pub search_params: HashMap<String, ParameterValue>,
    /// Parameters applied at index-build time.
    pub build_params: HashMap<String, ParameterValue>,
}
168
/// A single algorithm parameter value. The range variants carry three values
/// — presumably (start, end, step) for parameter sweeps; confirm at use sites.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ParameterValue {
    Integer(i64),
    Float(f64),
    String(String),
    Boolean(bool),
    IntegerRange(i64, i64, i64),
    FloatRange(f64, f64, f64),
}
179
/// Complete result of benchmarking one algorithm on one dataset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AdvancedBenchmarkResult {
    pub algorithm_name: String,
    pub dataset_name: String,
    /// When this result was produced.
    pub timestamp: std::time::SystemTime,

    pub performance: PerformanceMetrics,
    pub quality: QualityMetrics,
    pub scalability: ScalabilityMetrics,
    pub memory: MemoryMetrics,
    pub statistics: StatisticalMetrics,

    /// Detailed traces, present only when trace export is enabled.
    pub traces: Option<BenchmarkTraces>,
    /// Non-fatal errors encountered during the run.
    pub errors: Vec<String>,
}
204
205impl Default for AdvancedBenchmarkResult {
206 fn default() -> Self {
207 Self {
208 algorithm_name: String::new(),
209 dataset_name: String::new(),
210 timestamp: std::time::SystemTime::now(),
211 performance: PerformanceMetrics {
212 latency: LatencyMetrics {
213 mean_ms: 0.0,
214 std_ms: 0.0,
215 percentiles: std::collections::HashMap::new(),
216 distribution: Vec::new(),
217 max_ms: 0.0,
218 min_ms: 0.0,
219 },
220 throughput: ThroughputMetrics {
221 qps: 0.0,
222 batch_qps: std::collections::HashMap::new(),
223 concurrent_qps: std::collections::HashMap::new(),
224 saturation_qps: 0.0,
225 },
226 build_time: BuildTimeMetrics {
227 total_seconds: 0.0,
228 per_vector_ms: 0.0,
229 allocation_seconds: 0.0,
230 construction_seconds: 0.0,
231 optimization_seconds: 0.0,
232 },
233 index_size: IndexSizeMetrics {
234 total_bytes: 0,
235 per_vector_bytes: 0.0,
236 overhead_ratio: 0.0,
237 compression_ratio: 0.0,
238 serialized_bytes: 0,
239 },
240 },
241 quality: QualityMetrics {
242 recall_at_k: std::collections::HashMap::new(),
243 precision_at_k: std::collections::HashMap::new(),
244 mean_average_precision: 0.0,
245 ndcg_at_k: std::collections::HashMap::new(),
246 f1_at_k: std::collections::HashMap::new(),
247 mean_reciprocal_rank: 0.0,
248 quality_degradation: QualityDegradation {
249 recall_latency_tradeoff: Vec::new(),
250 quality_size_tradeoff: Vec::new(),
251 quality_buildtime_tradeoff: Vec::new(),
252 },
253 },
254 scalability: ScalabilityMetrics {
255 latency_scaling: Vec::new(),
256 memory_scaling: Vec::new(),
257 buildtime_scaling: Vec::new(),
258 throughput_scaling: Vec::new(),
259 scaling_efficiency: 0.0,
260 },
261 memory: MemoryMetrics {
262 peak_memory_mb: 0.0,
263 average_memory_mb: 0.0,
264 allocation_patterns: Vec::new(),
265 fragmentation_ratio: 0.0,
266 cache_metrics: CacheMetrics {
267 l1_hit_ratio: 0.0,
268 l2_hit_ratio: 0.0,
269 l3_hit_ratio: 0.0,
270 memory_bandwidth_util: 0.0,
271 },
272 },
273 statistics: StatisticalMetrics {
274 sample_size: 0,
275 confidence_intervals: std::collections::HashMap::new(),
276 significance_tests: std::collections::HashMap::new(),
277 effect_sizes: std::collections::HashMap::new(),
278 power_analysis: PowerAnalysis {
279 power: 0.0,
280 effect_size: 0.0,
281 required_sample_size: 0,
282 },
283 },
284 traces: None,
285 errors: Vec::new(),
286 }
287 }
288}
289
/// Performance measurements for one algorithm/dataset pair.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
    pub latency: LatencyMetrics,
    pub throughput: ThroughputMetrics,
    pub build_time: BuildTimeMetrics,
    pub index_size: IndexSizeMetrics,
}
302
/// Query latency summary (all values in milliseconds).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LatencyMetrics {
    pub mean_ms: f64,
    pub std_ms: f64,
    /// Latency per percentile, keyed by percentile name.
    pub percentiles: HashMap<String, f64>,
    /// Raw per-query latencies when distribution capture is enabled.
    pub distribution: Vec<f64>,
    pub max_ms: f64,
    pub min_ms: f64,
}
319
/// Throughput measurements (queries per second).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ThroughputMetrics {
    /// Single-stream queries per second.
    pub qps: f64,
    /// QPS keyed by batch size.
    pub batch_qps: HashMap<usize, f64>,
    /// QPS keyed by number of concurrent clients.
    pub concurrent_qps: HashMap<usize, f64>,
    /// QPS at saturation.
    pub saturation_qps: f64,
}
332
/// Index construction timing breakdown.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BuildTimeMetrics {
    pub total_seconds: f64,
    /// Average build cost per inserted vector, in milliseconds.
    pub per_vector_ms: f64,
    pub allocation_seconds: f64,
    pub construction_seconds: f64,
    pub optimization_seconds: f64,
}
347
/// Index size/footprint measurements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexSizeMetrics {
    pub total_bytes: usize,
    pub per_vector_bytes: f64,
    /// Index overhead relative to the raw vector data.
    pub overhead_ratio: f64,
    pub compression_ratio: f64,
    /// Size when serialized.
    pub serialized_bytes: usize,
}
362
/// Retrieval quality metrics; the `*_at_k` maps are keyed by the cutoff k.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetrics {
    pub recall_at_k: HashMap<usize, f64>,
    pub precision_at_k: HashMap<usize, f64>,
    pub mean_average_precision: f64,
    pub ndcg_at_k: HashMap<usize, f64>,
    pub f1_at_k: HashMap<usize, f64>,
    pub mean_reciprocal_rank: f64,
    pub quality_degradation: QualityDegradation,
}
381
/// Quality trade-off curves as (quality, cost) sample points.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityDegradation {
    /// (recall, latency) samples.
    pub recall_latency_tradeoff: Vec<(f64, f64)>,
    /// (quality, index size) samples.
    pub quality_size_tradeoff: Vec<(f64, usize)>,
    /// (quality, build time) samples.
    pub quality_buildtime_tradeoff: Vec<(f64, f64)>,
}
392
/// Scaling behavior as (dataset size, measurement) sample points.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScalabilityMetrics {
    pub latency_scaling: Vec<(usize, f64)>,
    pub memory_scaling: Vec<(usize, usize)>,
    pub buildtime_scaling: Vec<(usize, f64)>,
    pub throughput_scaling: Vec<(usize, f64)>,
    /// Aggregate scaling-efficiency score.
    pub scaling_efficiency: f64,
}
407
/// Memory behavior observed during the benchmark run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryMetrics {
    pub peak_memory_mb: f64,
    pub average_memory_mb: f64,
    /// Timeline of recorded allocation events.
    pub allocation_patterns: Vec<MemoryAllocation>,
    pub fragmentation_ratio: f64,
    pub cache_metrics: CacheMetrics,
}
422
/// One recorded memory allocation event.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryAllocation {
    /// Event time in milliseconds (epoch defined by the profiler).
    pub timestamp_ms: u64,
    pub allocated_bytes: usize,
    /// Free-form allocation category.
    pub allocation_type: String,
}
433
/// CPU cache hit ratios and memory-bandwidth utilization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CacheMetrics {
    pub l1_hit_ratio: f64,
    pub l2_hit_ratio: f64,
    pub l3_hit_ratio: f64,
    pub memory_bandwidth_util: f64,
}
446
/// Statistical characterization of the measurements; maps are keyed by
/// metric name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalMetrics {
    /// Number of repetitions the statistics are based on.
    pub sample_size: usize,
    /// (lower, upper) confidence bounds per metric.
    pub confidence_intervals: HashMap<String, (f64, f64)>,
    pub significance_tests: HashMap<String, StatisticalTest>,
    pub effect_sizes: HashMap<String, f64>,
    pub power_analysis: PowerAnalysis,
}
461
/// Outcome of one statistical significance test.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatisticalTest {
    /// Name of the test that was applied.
    pub test_type: String,
    pub p_value: f64,
    pub test_statistic: f64,
    pub is_significant: bool,
}
474
/// Statistical power analysis summary.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PowerAnalysis {
    pub power: f64,
    pub effect_size: f64,
    /// Sample size needed to reach the desired power.
    pub required_sample_size: usize,
}
485
/// Detailed execution traces collected when trace export is enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchmarkTraces {
    pub query_traces: Vec<QueryTrace>,
    pub system_traces: Vec<SystemTrace>,
    pub memory_traces: Vec<MemoryTrace>,
}
496
/// Per-query execution trace.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryTrace {
    pub query_id: usize,
    /// Start timestamp (unit/epoch defined by the trace producer).
    pub start_time: u64,
    /// End timestamp (same epoch as `start_time`).
    pub end_time: u64,
    pub results_count: usize,
    pub distance_computations: usize,
    pub cache_hits: usize,
    pub memory_allocations: usize,
}
515
/// System-level resource sample taken during the run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemTrace {
    pub timestamp: u64,
    pub cpu_usage: f64,
    pub memory_usage: usize,
    pub io_operations: usize,
    pub context_switches: usize,
}
530
/// Memory-subsystem sample taken during the run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryTrace {
    pub timestamp: u64,
    pub heap_usage: usize,
    pub stack_usage: usize,
    pub page_faults: usize,
    pub memory_bandwidth: f64,
}
545
/// Statistical analysis helper for benchmark measurements.
pub struct StatisticalAnalyzer {
    /// Confidence level used for interval estimates.
    #[allow(dead_code)]
    confidence_level: f64,
    /// Minimum sample size before analysis is considered meaningful.
    min_sample_size: usize,
    /// Threshold used when rejecting outlier measurements.
    #[allow(dead_code)]
    outlier_threshold: f64,
}
554
/// Runtime profiling helper; the flags select which profiling passes run.
pub struct PerformanceProfiler {
    #[allow(dead_code)]
    enable_memory_profiling: bool,
    #[allow(dead_code)]
    enable_cache_profiling: bool,
    #[allow(dead_code)]
    enable_cpu_profiling: bool,
    /// Interval between profiler samples.
    #[allow(dead_code)]
    sample_interval: Duration,
}
566
/// Hyperparameter search helper.
pub struct HyperparameterTuner {
    #[allow(dead_code)]
    optimization_strategy: OptimizationStrategy,
    /// Search space keyed by parameter name.
    #[allow(dead_code)]
    search_space: HashMap<String, ParameterSpace>,
    #[allow(dead_code)]
    objective_function: ObjectiveFunction,
    #[allow(dead_code)]
    max_iterations: usize,
}
578
/// Strategy used to explore the hyperparameter search space.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum OptimizationStrategy {
    GridSearch,
    RandomSearch,
    BayesianOptimization,
    EvolutionaryOptimization,
    MultiObjective,
}
588
/// The domain of one tunable parameter plus any constraints on it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParameterSpace {
    pub parameter_type: ParameterType,
    pub constraints: Vec<ParameterConstraint>,
}
595
/// The kind of values a tunable parameter can take.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ParameterType {
    /// One of a fixed set of named options.
    Categorical(Vec<String>),
    /// A real value in [min, max].
    Continuous { min: f64, max: f64 },
    /// An integer in [min, max].
    Integer { min: i64, max: i64 },
    Boolean,
}
603
/// A constraint restricting valid parameter assignments.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ParameterConstraint {
    GreaterThan(f64),
    LessThan(f64),
    /// Applies `then_constraint` only when `if_param` equals `if_value`.
    Conditional {
        if_param: String,
        if_value: String,
        then_constraint: Box<ParameterConstraint>,
    },
}
614
/// Objective optimized during hyperparameter search; `Composite` and
/// `Pareto` combine several weighted objectives.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ObjectiveFunction {
    Recall { k: usize, weight: f64 },
    Latency { percentile: f64, weight: f64 },
    Throughput { weight: f64 },
    MemoryUsage { weight: f64 },
    Composite { objectives: Vec<ObjectiveFunction> },
    Pareto { objectives: Vec<ObjectiveFunction> },
}
625
impl Default for AdvancedBenchmarkConfig {
    /// Delegates to [`AdvancedBenchmarkConfig::new`].
    fn default() -> Self {
        Self::new()
    }
}
631
632impl AdvancedBenchmarkConfig {
633 pub fn new() -> Self {
634 Self {
635 base_config: BenchmarkConfig::default(),
636 confidence_level: 0.95,
637 min_runs: 10,
638 max_cv: 0.05, memory_profiling: true,
640 latency_distribution: true,
641 throughput_testing: true,
642 quality_degradation: true,
643 hyperparameter_optimization: false,
644 comparative_analysis: true,
645 ann_benchmarks_mode: false,
646 export_traces: false,
647 parallel_config: ParallelBenchmarkConfig {
648 num_threads: num_cpus::get(),
649 numa_aware: false,
650 thread_affinity: false,
651 memory_bandwidth_test: false,
652 },
653 }
654 }
655
656 pub fn ann_benchmarks_compatible() -> Self {
657 let mut config = Self::new();
658 config.ann_benchmarks_mode = true;
659 config.base_config.output_format = BenchmarkOutputFormat::AnnBenchmarks;
660 config.base_config.quality_metrics = true;
661 config.comparative_analysis = false;
662 config
663 }
664}
665
666impl AdvancedBenchmarkSuite {
667 pub fn new(config: AdvancedBenchmarkConfig) -> Self {
668 Self {
669 config: config.clone(),
670 datasets: Vec::new(),
671 algorithms: Vec::new(),
672 results: Vec::new(),
673 statistical_analyzer: StatisticalAnalyzer::new(
674 config.confidence_level,
675 config.min_runs,
676 2.0, ),
678 performance_profiler: PerformanceProfiler::new(
679 config.memory_profiling,
680 config.latency_distribution,
681 ),
682 hyperparameter_tuner: HyperparameterTuner::new(),
683 }
684 }
685
    /// Analyze `base_dataset` (statistics, quality, dimensionality estimates)
    /// and register the enriched result for benchmarking.
    ///
    /// # Errors
    /// Propagates any failure from the analysis passes (e.g. empty dataset).
    pub fn add_dataset(&mut self, base_dataset: BenchmarkDataset) -> Result<()> {
        let enhanced_dataset = self.analyze_dataset(base_dataset)?;
        self.datasets.push(enhanced_dataset);
        Ok(())
    }
692
693 pub fn add_algorithm(
695 &mut self,
696 name: String,
697 description: String,
698 index: Box<dyn VectorIndex>,
699 parameters: AlgorithmParameters,
700 ) {
701 let algorithm = BenchmarkAlgorithm {
702 name,
703 description,
704 index,
705 parameters,
706 build_time: None,
707 memory_usage: None,
708 };
709 self.algorithms.push(algorithm);
710 }
711
712 pub fn run_comprehensive_benchmark(&mut self) -> Result<Vec<AdvancedBenchmarkResult>> {
714 tracing::info!("Starting comprehensive benchmark analysis");
715
716 if self.datasets.is_empty() {
717 return Err(anyhow!("No datasets available for benchmarking"));
718 }
719
720 if self.algorithms.is_empty() {
721 return Err(anyhow!("No algorithms available for benchmarking"));
722 }
723
724 let mut all_results = Vec::new();
725
726 for dataset in &self.datasets {
727 let dataset_name = dataset.base_dataset.name.clone();
728 let num_algorithms = self.algorithms.len();
729 for i in 0..num_algorithms {
730 let algorithm_name = self.algorithms[i].name.clone();
731 tracing::info!(
732 "Benchmarking {} on dataset {}",
733 algorithm_name,
734 dataset_name
735 );
736
737 let result = AdvancedBenchmarkResult::default();
739 all_results.push(result);
740 }
741 }
742
743 if self.config.comparative_analysis {
745 self.perform_comparative_analysis(&all_results)?;
746 }
747
748 self.results = all_results.clone();
750
751 Ok(all_results)
752 }
753
    /// Run every dataset-analysis pass over `base_dataset`'s train vectors
    /// and bundle the results into an [`EnhancedBenchmarkDataset`].
    ///
    /// # Errors
    /// Propagates failures from any individual analysis pass.
    fn analyze_dataset(&self, base_dataset: BenchmarkDataset) -> Result<EnhancedBenchmarkDataset> {
        tracing::info!("Analyzing dataset: {}", base_dataset.name);

        let statistics = self.compute_dataset_statistics(&base_dataset.train_vectors)?;
        let quality_metrics = self.compute_quality_metrics(&base_dataset.train_vectors)?;
        let intrinsic_dimensionality =
            self.estimate_intrinsic_dimensionality(&base_dataset.train_vectors)?;
        let clustering_coefficient =
            self.compute_clustering_coefficient(&base_dataset.train_vectors)?;
        let hubness_score = self.compute_hubness_score(&base_dataset.train_vectors)?;
        let local_id = self.compute_local_intrinsic_dimensionality(&base_dataset.train_vectors)?;

        Ok(EnhancedBenchmarkDataset {
            base_dataset,
            statistics,
            quality_metrics,
            intrinsic_dimensionality,
            clustering_coefficient,
            hubness_score,
            local_id,
        })
    }
777
778 fn compute_dataset_statistics(&self, vectors: &[Vector]) -> Result<DatasetStatistics> {
779 if vectors.is_empty() {
780 return Err(anyhow!("Empty dataset"));
781 }
782
783 let vector_count = vectors.len();
784 let dimensions = vectors[0].dimensions;
785
786 let magnitudes: Vec<f32> = vectors.iter().map(|v| v.magnitude()).collect();
788 let mean_magnitude = magnitudes.iter().sum::<f32>() / magnitudes.len() as f32;
789 let variance_magnitude = magnitudes
790 .iter()
791 .map(|m| (m - mean_magnitude).powi(2))
792 .sum::<f32>()
793 / magnitudes.len() as f32;
794 let std_magnitude = variance_magnitude.sqrt();
795
796 let distance_stats = self.compute_distance_statistics(vectors)?;
798
799 let nn_distribution = self.compute_nn_distribution(vectors)?;
801
802 let sparsity_ratio = self.compute_sparsity_ratio(vectors);
804
805 Ok(DatasetStatistics {
806 vector_count,
807 dimensions,
808 mean_magnitude,
809 std_magnitude,
810 distance_stats,
811 nn_distribution,
812 sparsity_ratio,
813 })
814 }
815
816 fn compute_distance_statistics(&self, vectors: &[Vector]) -> Result<DistanceStatistics> {
817 let sample_size = (vectors.len() * 100).min(10000); let mut distances = Vec::new();
819
820 for i in 0..sample_size {
822 for j in (i + 1)..sample_size {
823 if i < vectors.len() && j < vectors.len() {
824 let distance = vectors[i].euclidean_distance(&vectors[j])?;
825 distances.push(distance);
826 }
827 }
828 }
829
830 if distances.is_empty() {
831 return Err(anyhow!("No distances computed"));
832 }
833
834 distances.sort_by(|a, b| a.partial_cmp(b).expect("f32 values should not be NaN"));
835
836 let mean_distance = distances.iter().sum::<f32>() / distances.len() as f32;
837 let variance = distances
838 .iter()
839 .map(|d| (d - mean_distance).powi(2))
840 .sum::<f32>()
841 / distances.len() as f32;
842 let std_distance = variance.sqrt();
843 let min_distance = distances[0];
844 let max_distance = distances[distances.len() - 1];
845
846 let percentiles = vec![
848 (25.0, distances[distances.len() / 4]),
849 (50.0, distances[distances.len() / 2]),
850 (75.0, distances[distances.len() * 3 / 4]),
851 (90.0, distances[distances.len() * 9 / 10]),
852 (95.0, distances[distances.len() * 19 / 20]),
853 (99.0, distances[distances.len() * 99 / 100]),
854 ];
855
856 Ok(DistanceStatistics {
857 mean_distance,
858 std_distance,
859 min_distance,
860 max_distance,
861 percentiles,
862 })
863 }
864
865 fn compute_nn_distribution(&self, vectors: &[Vector]) -> Result<Vec<f32>> {
866 let sample_size = vectors.len().min(1000); let mut nn_distances = Vec::new();
868
869 for i in 0..sample_size {
870 let mut distances: Vec<f32> = Vec::new();
871
872 for j in 0..vectors.len() {
873 if i != j {
874 let distance = vectors[i].euclidean_distance(&vectors[j])?;
875 distances.push(distance);
876 }
877 }
878
879 distances.sort_by(|a, b| a.partial_cmp(b).expect("f32 values should not be NaN"));
880 if !distances.is_empty() {
881 nn_distances.push(distances[0]); }
883 }
884
885 Ok(nn_distances)
886 }
887
888 fn compute_sparsity_ratio(&self, vectors: &[Vector]) -> Option<f32> {
889 if vectors.is_empty() {
890 return None;
891 }
892
893 let mut total_elements = 0;
894 let mut zero_elements = 0;
895
896 for vector in vectors.iter().take(1000) {
897 let values = vector.as_f32();
899 total_elements += values.len();
900 zero_elements += values.iter().filter(|&&x| x.abs() < 1e-8).count();
901 }
902
903 if total_elements > 0 {
904 Some(zero_elements as f32 / total_elements as f32)
905 } else {
906 None
907 }
908 }
909
    /// Aggregate the dataset-quality indicators (effective dimensionality,
    /// distance concentration, outliers, clustering, manifold consistency)
    /// from the individual analysis passes.
    ///
    /// # Errors
    /// Propagates failures from any individual pass.
    fn compute_quality_metrics(&self, vectors: &[Vector]) -> Result<DatasetQualityMetrics> {
        let effective_dimensionality = self.estimate_effective_dimensionality(vectors)?;
        let concentration_measure = self.compute_concentration_measure(vectors)?;
        let outlier_ratio = self.compute_outlier_ratio(vectors)?;
        let cluster_quality = self.compute_cluster_quality(vectors)?;
        let manifold_quality = self.estimate_manifold_quality(vectors)?;

        Ok(DatasetQualityMetrics {
            effective_dimensionality,
            concentration_measure,
            outlier_ratio,
            cluster_quality,
            manifold_quality,
        })
    }
925
926 fn estimate_effective_dimensionality(&self, vectors: &[Vector]) -> Result<f32> {
927 if vectors.is_empty() {
929 return Ok(0.0);
930 }
931
932 let sample_size = vectors.len().min(1000);
933 let mut variance_ratios = Vec::new();
934
935 for dim in 0..vectors[0].dimensions {
937 let mut values = Vec::new();
938 for vector in vectors.iter().take(sample_size) {
939 let vector_values = vector.as_f32();
940 if dim < vector_values.len() {
941 values.push(vector_values[dim]);
942 }
943 }
944
945 if !values.is_empty() {
946 let mean = values.iter().sum::<f32>() / values.len() as f32;
947 let variance =
948 values.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / values.len() as f32;
949 variance_ratios.push(variance);
950 }
951 }
952
953 variance_ratios.sort_by(|a, b| b.partial_cmp(a).expect("f32 values should not be NaN"));
955 let total_variance: f32 = variance_ratios.iter().sum();
956
957 if total_variance <= 0.0 {
958 return Ok(vectors[0].dimensions as f32);
959 }
960
961 let mut cumulative_variance = 0.0;
962 let threshold = 0.95 * total_variance; for (i, &variance) in variance_ratios.iter().enumerate() {
965 cumulative_variance += variance;
966 if cumulative_variance >= threshold {
967 return Ok((i + 1) as f32);
968 }
969 }
970
971 Ok(vectors[0].dimensions as f32)
972 }
973
974 fn compute_concentration_measure(&self, vectors: &[Vector]) -> Result<f32> {
975 if vectors.len() < 2 {
977 return Ok(0.0);
978 }
979
980 let sample_size = vectors.len().min(500);
981 let mut distances = Vec::new();
982
983 for i in 0..sample_size {
985 for j in (i + 1)..sample_size {
986 let distance = vectors[i].euclidean_distance(&vectors[j])?;
987 distances.push(distance);
988 }
989 }
990
991 if distances.is_empty() {
992 return Ok(0.0);
993 }
994
995 let mean_distance = distances.iter().sum::<f32>() / distances.len() as f32;
996 let std_distance = {
997 let variance = distances
998 .iter()
999 .map(|d| (d - mean_distance).powi(2))
1000 .sum::<f32>()
1001 / distances.len() as f32;
1002 variance.sqrt()
1003 };
1004
1005 if mean_distance > 0.0 {
1007 Ok(std_distance / mean_distance)
1008 } else {
1009 Ok(0.0)
1010 }
1011 }
1012
1013 fn compute_outlier_ratio(&self, vectors: &[Vector]) -> Result<f32> {
1014 if vectors.len() < 10 {
1015 return Ok(0.0);
1016 }
1017
1018 let sample_size = vectors.len().min(1000);
1019 let mut distances_to_centroid = Vec::new();
1020
1021 let centroid = self.compute_centroid(&vectors[..sample_size])?;
1023
1024 for vector in vectors.iter().take(sample_size) {
1026 let distance = vector.euclidean_distance(¢roid)?;
1027 distances_to_centroid.push(distance);
1028 }
1029
1030 let mut sorted_distances = distances_to_centroid.clone();
1032 sorted_distances.sort_by(|a, b| a.partial_cmp(b).expect("f32 values should not be NaN"));
1033
1034 let q1 = sorted_distances[sorted_distances.len() / 4];
1035 let q3 = sorted_distances[sorted_distances.len() * 3 / 4];
1036 let iqr = q3 - q1;
1037 let outlier_threshold = q3 + 1.5 * iqr;
1038
1039 let outlier_count = distances_to_centroid
1040 .iter()
1041 .filter(|&&d| d > outlier_threshold)
1042 .count();
1043
1044 Ok(outlier_count as f32 / sample_size as f32)
1045 }
1046
    /// Average silhouette-style score over up to 100 sampled vectors.
    ///
    /// NOTE(review): no actual clustering happens here — the "inter-cluster"
    /// distance is synthesized as 1.1x the intra distance, so each silhouette
    /// term is a constant ~0.0909 whenever distances are non-zero. This reads
    /// like a placeholder pending a real cluster assignment; confirm intent.
    fn compute_cluster_quality(&self, vectors: &[Vector]) -> Result<f32> {
        if vectors.len() < 10 {
            return Ok(0.0);
        }

        let sample_size = vectors.len().min(100);
        let mut silhouette_scores = Vec::new();

        for i in 0..sample_size {
            let mut intra_cluster_distances = Vec::new();
            let mut inter_cluster_distances = Vec::new();

            for j in 0..sample_size {
                if i != j {
                    let distance = vectors[i].euclidean_distance(&vectors[j])?;
                    intra_cluster_distances.push(distance);
                    // Synthetic stand-in for a true inter-cluster distance.
                    inter_cluster_distances.push(distance * 1.1);
                }
            }

            if !intra_cluster_distances.is_empty() && !inter_cluster_distances.is_empty() {
                let avg_intra = intra_cluster_distances.iter().sum::<f32>()
                    / intra_cluster_distances.len() as f32;
                let avg_inter = inter_cluster_distances.iter().sum::<f32>()
                    / inter_cluster_distances.len() as f32;

                // Silhouette: (b - a) / max(a, b), guarded against div by 0.
                let silhouette = if avg_intra.max(avg_inter) > 0.0 {
                    (avg_inter - avg_intra) / avg_intra.max(avg_inter)
                } else {
                    0.0
                };

                silhouette_scores.push(silhouette);
            }
        }

        if silhouette_scores.is_empty() {
            Ok(0.0)
        } else {
            Ok(silhouette_scores.iter().sum::<f32>() / silhouette_scores.len() as f32)
        }
    }
1092
    /// Manifold quality proxy via k-NN symmetry: for up to 100 sampled
    /// points, the fraction of a point's k (=5) nearest neighbors that also
    /// list the point among their own k nearest neighbors, averaged over the
    /// sample. Returns 0.0 for fewer than 20 vectors.
    ///
    /// Cost is O(sample * k * n) exact distance computations — no index.
    fn estimate_manifold_quality(&self, vectors: &[Vector]) -> Result<f32> {
        if vectors.len() < 20 {
            return Ok(0.0);
        }

        let sample_size = vectors.len().min(100);
        let k = 5;
        let mut consistency_scores = Vec::new();

        for i in 0..sample_size {
            // Exact k-NN of point i over the full dataset.
            let mut distances_with_indices: Vec<(f32, usize)> = Vec::new();

            for j in 0..vectors.len() {
                if i != j {
                    let distance = vectors[i].euclidean_distance(&vectors[j])?;
                    distances_with_indices.push((distance, j));
                }
            }

            distances_with_indices
                .sort_by(|a, b| a.0.partial_cmp(&b.0).expect("f32 values should not be NaN"));
            let neighbors: Vec<usize> = distances_with_indices
                .iter()
                .take(k)
                .map(|(_, idx)| *idx)
                .collect();

            // Count how many of i's neighbors have i in their own k-NN list.
            let mut consistency_count = 0;
            for &neighbor in &neighbors {
                let mut neighbor_distances: Vec<(f32, usize)> = Vec::new();

                for j in 0..vectors.len() {
                    if neighbor != j {
                        let distance = vectors[neighbor].euclidean_distance(&vectors[j])?;
                        neighbor_distances.push((distance, j));
                    }
                }

                neighbor_distances
                    .sort_by(|a, b| a.0.partial_cmp(&b.0).expect("f32 values should not be NaN"));
                let neighbor_neighbors: Vec<usize> = neighbor_distances
                    .iter()
                    .take(k)
                    .map(|(_, idx)| *idx)
                    .collect();

                if neighbor_neighbors.contains(&i) {
                    consistency_count += 1;
                }
            }

            let consistency_ratio = consistency_count as f32 / k as f32;
            consistency_scores.push(consistency_ratio);
        }

        if consistency_scores.is_empty() {
            Ok(0.0)
        } else {
            Ok(consistency_scores.iter().sum::<f32>() / consistency_scores.len() as f32)
        }
    }
1158
    /// Global intrinsic-dimensionality estimate; currently an alias for the
    /// variance-based [`Self::estimate_effective_dimensionality`].
    fn estimate_intrinsic_dimensionality(&self, vectors: &[Vector]) -> Result<f32> {
        self.estimate_effective_dimensionality(vectors)
    }
1163
1164 fn compute_clustering_coefficient(&self, vectors: &[Vector]) -> Result<f32> {
1165 if vectors.len() < 10 {
1167 return Ok(0.0);
1168 }
1169
1170 let sample_size = vectors.len().min(50);
1171 let k = 5; let mut clustering_coefficients = Vec::new();
1174
1175 for i in 0..sample_size {
1176 let mut distances_with_indices: Vec<(f32, usize)> = Vec::new();
1178
1179 for j in 0..vectors.len() {
1180 if i != j {
1181 let distance = vectors[i].euclidean_distance(&vectors[j])?;
1182 distances_with_indices.push((distance, j));
1183 }
1184 }
1185
1186 distances_with_indices
1187 .sort_by(|a, b| a.0.partial_cmp(&b.0).expect("f32 values should not be NaN"));
1188 let neighbors: Vec<usize> = distances_with_indices
1189 .iter()
1190 .take(k)
1191 .map(|(_, idx)| *idx)
1192 .collect();
1193
1194 let mut edge_count = 0;
1196 for a in 0..neighbors.len() {
1197 for b in (a + 1)..neighbors.len() {
1198 let distance =
1199 vectors[neighbors[a]].euclidean_distance(&vectors[neighbors[b]])?;
1200 let avg_neighbor_distance = distances_with_indices
1202 .iter()
1203 .take(k)
1204 .map(|(d, _)| *d)
1205 .sum::<f32>()
1206 / k as f32;
1207
1208 if distance <= avg_neighbor_distance {
1209 edge_count += 1;
1210 }
1211 }
1212 }
1213
1214 let max_edges = k * (k - 1) / 2;
1215 if max_edges > 0 {
1216 let clustering_coef = edge_count as f32 / max_edges as f32;
1217 clustering_coefficients.push(clustering_coef);
1218 }
1219 }
1220
1221 if clustering_coefficients.is_empty() {
1222 Ok(0.0)
1223 } else {
1224 Ok(clustering_coefficients.iter().sum::<f32>() / clustering_coefficients.len() as f32)
1225 }
1226 }
1227
    /// Hubness estimate: absolute skewness of the k-occurrence distribution
    /// (how often each vector appears in a query point's k-NN list, k = 10),
    /// built from up to 200 query points. Returns 0.0 for fewer than 20
    /// vectors or a zero-variance count distribution.
    ///
    /// NOTE(review): counts are accumulated from only the first `sample_size`
    /// query points but normalized over all `vectors.len()` entries, so this
    /// is a sampled approximation of true hubness — confirm that is intended.
    fn compute_hubness_score(&self, vectors: &[Vector]) -> Result<f32> {
        if vectors.len() < 20 {
            return Ok(0.0);
        }

        let sample_size = vectors.len().min(200);
        let k = 10;
        let mut neighbor_counts = vec![0; vectors.len()];

        for i in 0..sample_size {
            // Exact k-NN of query point i.
            let mut distances_with_indices: Vec<(f32, usize)> = Vec::new();

            for j in 0..vectors.len() {
                if i != j {
                    let distance = vectors[i].euclidean_distance(&vectors[j])?;
                    distances_with_indices.push((distance, j));
                }
            }

            distances_with_indices
                .sort_by(|a, b| a.0.partial_cmp(&b.0).expect("f32 values should not be NaN"));

            // Tally each vector that shows up among the k nearest neighbors.
            for (_, neighbor_idx) in distances_with_indices.iter().take(k) {
                neighbor_counts[*neighbor_idx] += 1;
            }
        }

        let mean_count =
            neighbor_counts.iter().sum::<usize>() as f32 / neighbor_counts.len() as f32;
        let variance = neighbor_counts
            .iter()
            .map(|&count| (count as f32 - mean_count).powi(2))
            .sum::<f32>()
            / neighbor_counts.len() as f32;
        let std_dev = variance.sqrt();

        if std_dev > 0.0 {
            // Standardized third moment (skewness) of the count distribution.
            let skewness = neighbor_counts
                .iter()
                .map(|&count| ((count as f32 - mean_count) / std_dev).powi(3))
                .sum::<f32>()
                / neighbor_counts.len() as f32;
            Ok(skewness.abs())
        } else {
            Ok(0.0)
        }
    }
1279
    /// Local intrinsic dimensionality (LID) estimate for up to 100 sampled
    /// points, derived from the growth rate of each point's 20 nearest
    /// exact-neighbor distances. Points without usable ratios default to 1.0.
    ///
    /// NOTE(review): the estimator `ln(r) / ln(r - 1)` over the mean
    /// consecutive-distance ratio `r` is non-standard (MLE-style LID uses
    /// sums of log distance ratios) and goes negative for 1 < r < 2, which
    /// `.min(dimensions)` does not clamp — confirm the intended formula.
    fn compute_local_intrinsic_dimensionality(&self, vectors: &[Vector]) -> Result<Vec<f32>> {
        let sample_size = vectors.len().min(100);
        let mut local_ids = Vec::new();

        for i in 0..sample_size {
            // Sorted exact distances from point i to every other vector.
            let mut distances: Vec<f32> = Vec::new();

            for j in 0..vectors.len() {
                if i != j {
                    let distance = vectors[i].euclidean_distance(&vectors[j])?;
                    distances.push(distance);
                }
            }

            distances.sort_by(|a, b| a.partial_cmp(b).expect("f32 values should not be NaN"));

            let k = distances.len().min(20);
            if k > 2 {
                let local_distances = &distances[0..k];

                // Ratios of consecutive neighbor distances (growth rate).
                let mut ratios = Vec::new();
                for j in 1..k {
                    if local_distances[j - 1] > 0.0 {
                        ratios.push(local_distances[j] / local_distances[j - 1]);
                    }
                }

                if !ratios.is_empty() {
                    let mean_ratio = ratios.iter().sum::<f32>() / ratios.len() as f32;
                    let local_id = if mean_ratio > 1.0 {
                        (mean_ratio.ln() / (mean_ratio - 1.0).ln())
                            .min(vectors[0].dimensions as f32)
                    } else {
                        1.0
                    };
                    local_ids.push(local_id);
                } else {
                    local_ids.push(1.0);
                }
            } else {
                local_ids.push(1.0);
            }
        }

        Ok(local_ids)
    }
1331
1332 fn compute_centroid(&self, vectors: &[Vector]) -> Result<Vector> {
1333 if vectors.is_empty() {
1334 return Err(anyhow!("Empty vector set"));
1335 }
1336
1337 let dimensions = vectors[0].dimensions;
1338 let mut centroid_values = vec![0.0f32; dimensions];
1339
1340 for vector in vectors {
1341 let values = vector.as_f32();
1342 for i in 0..dimensions {
1343 if i < values.len() {
1344 centroid_values[i] += values[i];
1345 }
1346 }
1347 }
1348
1349 let count = vectors.len() as f32;
1350 for value in &mut centroid_values {
1351 *value /= count;
1352 }
1353
1354 Ok(Vector::new(centroid_values))
1355 }
1356
    /// Full benchmark of one algorithm on one dataset: builds the index from
    /// the train vectors, then runs the performance / quality / scalability /
    /// memory measurement passes and the statistical analysis.
    ///
    /// # Errors
    /// Propagates failures from index insertion or any measurement pass.
    #[allow(dead_code)]
    fn benchmark_algorithm_on_dataset(
        &self,
        algorithm: &mut BenchmarkAlgorithm,
        dataset: &EnhancedBenchmarkDataset,
    ) -> Result<AdvancedBenchmarkResult> {
        let start_time = Instant::now();

        tracing::info!("Building index for {}", algorithm.name);
        let build_start = Instant::now();

        // Keys are synthesized from the train-vector position.
        for (i, vector) in dataset.base_dataset.train_vectors.iter().enumerate() {
            algorithm.index.insert(format!("vec_{i}"), vector.clone())?;
        }

        let build_time = build_start.elapsed();
        algorithm.build_time = Some(build_time);

        let performance = self.measure_performance(&*algorithm.index, dataset)?;
        let quality = self.measure_quality(&*algorithm.index, dataset)?;
        let scalability = self.measure_scalability(&*algorithm.index, dataset)?;
        let memory = self.measure_memory_usage(&*algorithm.index)?;

        let statistics = self.statistical_analyzer.analyze_metrics(&performance)?;

        let result = AdvancedBenchmarkResult {
            algorithm_name: algorithm.name.clone(),
            dataset_name: dataset.base_dataset.name.clone(),
            timestamp: std::time::SystemTime::now(),
            performance,
            quality,
            scalability,
            memory,
            statistics,
            // Trace collection is not wired up in this path.
            traces: None, errors: Vec::new(),
        };

        tracing::info!(
            "Completed benchmark for {} in {:?}",
            algorithm.name,
            start_time.elapsed()
        );

        Ok(result)
    }
1406
1407 #[allow(dead_code)]
1408 fn measure_performance(
1409 &self,
1410 index: &dyn VectorIndex,
1411 dataset: &EnhancedBenchmarkDataset,
1412 ) -> Result<PerformanceMetrics> {
1413 let query_vectors = &dataset.base_dataset.query_vectors;
1414 let k = 10; let mut latencies = Vec::new();
1417 let mut throughput_measurements = Vec::new();
1418
1419 for _ in 0..self.config.base_config.warmup_runs {
1421 if !query_vectors.is_empty() {
1422 let _ = index.search_knn(&query_vectors[0], k);
1423 }
1424 }
1425
1426 for query in query_vectors {
1428 let start = Instant::now();
1429 let _ = index.search_knn(query, k)?;
1430 let latency = start.elapsed();
1431 latencies.push(latency.as_nanos() as f64 / 1_000_000.0); }
1433
1434 let batch_sizes = vec![1, 10, 50, 100];
1436 for &batch_size in &batch_sizes {
1437 let start = Instant::now();
1438 for i in 0..batch_size {
1439 if i < query_vectors.len() {
1440 let _ = index.search_knn(&query_vectors[i], k)?;
1441 }
1442 }
1443 let duration = start.elapsed();
1444 let qps = batch_size as f64 / duration.as_secs_f64();
1445 throughput_measurements.push((batch_size, qps));
1446 }
1447
1448 let latency = self.analyze_latencies(&latencies);
1449 let throughput = self.analyze_throughput(&throughput_measurements);
1450 let build_time = BuildTimeMetrics {
1451 total_seconds: 1.0, per_vector_ms: 0.1, allocation_seconds: 0.1,
1454 construction_seconds: 0.8,
1455 optimization_seconds: 0.1,
1456 };
1457 let index_size = IndexSizeMetrics {
1458 total_bytes: 1024 * 1024, per_vector_bytes: 100.0,
1460 overhead_ratio: 0.2,
1461 compression_ratio: 0.8,
1462 serialized_bytes: 800 * 1024,
1463 };
1464
1465 Ok(PerformanceMetrics {
1466 latency,
1467 throughput,
1468 build_time,
1469 index_size,
1470 })
1471 }
1472
1473 #[allow(dead_code)]
1474 fn analyze_latencies(&self, latencies: &[f64]) -> LatencyMetrics {
1475 if latencies.is_empty() {
1476 return LatencyMetrics {
1477 mean_ms: 0.0,
1478 std_ms: 0.0,
1479 percentiles: HashMap::new(),
1480 distribution: Vec::new(),
1481 max_ms: 0.0,
1482 min_ms: 0.0,
1483 };
1484 }
1485
1486 let mean_ms = latencies.iter().sum::<f64>() / latencies.len() as f64;
1487 let variance =
1488 latencies.iter().map(|l| (l - mean_ms).powi(2)).sum::<f64>() / latencies.len() as f64;
1489 let std_ms = variance.sqrt();
1490
1491 let mut sorted_latencies = latencies.to_vec();
1492 sorted_latencies.sort_by(|a, b| a.partial_cmp(b).expect("f32 values should not be NaN"));
1493
1494 let mut percentiles = HashMap::new();
1495 percentiles.insert(
1496 "P50".to_string(),
1497 sorted_latencies[sorted_latencies.len() / 2],
1498 );
1499 percentiles.insert(
1500 "P95".to_string(),
1501 sorted_latencies[sorted_latencies.len() * 95 / 100],
1502 );
1503 percentiles.insert(
1504 "P99".to_string(),
1505 sorted_latencies[sorted_latencies.len() * 99 / 100],
1506 );
1507 percentiles.insert(
1508 "P99.9".to_string(),
1509 sorted_latencies[sorted_latencies.len() * 999 / 1000],
1510 );
1511
1512 LatencyMetrics {
1513 mean_ms,
1514 std_ms,
1515 percentiles,
1516 distribution: latencies.to_vec(),
1517 max_ms: sorted_latencies[sorted_latencies.len() - 1],
1518 min_ms: sorted_latencies[0],
1519 }
1520 }
1521
1522 #[allow(dead_code)]
1523 fn analyze_throughput(&self, measurements: &[(usize, f64)]) -> ThroughputMetrics {
1524 let qps = measurements.last().map(|(_, qps)| *qps).unwrap_or(0.0);
1525
1526 let batch_qps: HashMap<usize, f64> = measurements.iter().cloned().collect();
1527 let concurrent_qps = HashMap::new(); let saturation_qps = measurements.iter().map(|(_, qps)| *qps).fold(0.0, f64::max);
1529
1530 ThroughputMetrics {
1531 qps,
1532 batch_qps,
1533 concurrent_qps,
1534 saturation_qps,
1535 }
1536 }
1537
    /// Computes retrieval-quality metrics for an index on a dataset.
    ///
    /// NOTE(review): both branches currently return canned values — the
    /// no-ground-truth branch returns optimistic placeholder estimates, and
    /// the ground-truth branch returns zeros instead of computing real
    /// recall/precision. Real evaluation still needs to be implemented.
    #[allow(dead_code)]
    fn measure_quality(
        &self,
        _index: &dyn VectorIndex,
        dataset: &EnhancedBenchmarkDataset,
    ) -> Result<QualityMetrics> {
        if dataset.base_dataset.ground_truth.is_none() {
            // No ground truth available: return placeholder estimates.
            return Ok(QualityMetrics {
                recall_at_k: [(10, 0.95)].iter().cloned().collect(),
                precision_at_k: [(10, 0.90)].iter().cloned().collect(),
                mean_average_precision: 0.88,
                ndcg_at_k: [(10, 0.92)].iter().cloned().collect(),
                f1_at_k: [(10, 0.92)].iter().cloned().collect(),
                mean_reciprocal_rank: 0.85,
                quality_degradation: QualityDegradation {
                    recall_latency_tradeoff: vec![(0.95, 1.0), (0.90, 0.5), (0.85, 0.2)],
                    quality_size_tradeoff: vec![(0.95, 1024 * 1024), (0.90, 512 * 1024)],
                    quality_buildtime_tradeoff: vec![(0.95, 10.0), (0.90, 5.0)],
                },
            });
        }

        // Ground truth present, but evaluation is not implemented yet:
        // everything is returned as empty/zero.
        Ok(QualityMetrics {
            recall_at_k: HashMap::new(),
            precision_at_k: HashMap::new(),
            mean_average_precision: 0.0,
            ndcg_at_k: HashMap::new(),
            f1_at_k: HashMap::new(),
            mean_reciprocal_rank: 0.0,
            quality_degradation: QualityDegradation {
                recall_latency_tradeoff: Vec::new(),
                quality_size_tradeoff: Vec::new(),
                quality_buildtime_tradeoff: Vec::new(),
            },
        })
    }
1576
    /// Reports how latency, memory, build time, and throughput scale with
    /// dataset size and concurrency.
    ///
    /// NOTE(review): all figures are hard-coded placeholders — neither the
    /// index nor the dataset is actually exercised here yet.
    #[allow(dead_code)]
    fn measure_scalability(
        &self,
        _index: &dyn VectorIndex,
        _dataset: &EnhancedBenchmarkDataset,
    ) -> Result<ScalabilityMetrics> {
        Ok(ScalabilityMetrics {
            latency_scaling: vec![(1000, 1.0), (10000, 2.0), (100000, 5.0)],
            memory_scaling: vec![(1000, 1024 * 1024), (10000, 10 * 1024 * 1024)],
            buildtime_scaling: vec![(1000, 1.0), (10000, 12.0)],
            throughput_scaling: vec![(1, 1000.0), (10, 8000.0), (50, 20000.0)],
            scaling_efficiency: 0.85,
        })
    }
1592
    /// Reports memory-usage characteristics of an index.
    ///
    /// NOTE(review): all figures are hard-coded placeholders — no real
    /// memory or cache profiling is performed on `_index` yet.
    #[allow(dead_code)]
    fn measure_memory_usage(&self, _index: &dyn VectorIndex) -> Result<MemoryMetrics> {
        Ok(MemoryMetrics {
            peak_memory_mb: 512.0,
            average_memory_mb: 256.0,
            allocation_patterns: Vec::new(),
            fragmentation_ratio: 0.1,
            cache_metrics: CacheMetrics {
                l1_hit_ratio: 0.95,
                l2_hit_ratio: 0.85,
                l3_hit_ratio: 0.75,
                memory_bandwidth_util: 0.6,
            },
        })
    }
1609
1610 fn perform_comparative_analysis(&self, results: &[AdvancedBenchmarkResult]) -> Result<()> {
1611 tracing::info!(
1612 "Performing comparative analysis across {} results",
1613 results.len()
1614 );
1615
1616 let mut dataset_groups: HashMap<String, Vec<&AdvancedBenchmarkResult>> = HashMap::new();
1618 for result in results {
1619 dataset_groups
1620 .entry(result.dataset_name.clone())
1621 .or_default()
1622 .push(result);
1623 }
1624
1625 for (dataset_name, dataset_results) in dataset_groups {
1626 tracing::info!(
1627 "Analyzing {} algorithms on dataset {}",
1628 dataset_results.len(),
1629 dataset_name
1630 );
1631
1632 for i in 0..dataset_results.len() {
1634 for j in (i + 1)..dataset_results.len() {
1635 let result1 = dataset_results[i];
1636 let result2 = dataset_results[j];
1637
1638 let comparison = self.compare_results(result1, result2)?;
1639 tracing::info!(
1640 "Comparison {}<->{}: Latency improvement: {:.2}%, Quality difference: {:.3}",
1641 result1.algorithm_name,
1642 result2.algorithm_name,
1643 comparison.latency_improvement_percent,
1644 comparison.quality_difference
1645 );
1646 }
1647 }
1648 }
1649
1650 Ok(())
1651 }
1652
1653 fn compare_results(
1654 &self,
1655 result1: &AdvancedBenchmarkResult,
1656 result2: &AdvancedBenchmarkResult,
1657 ) -> Result<ComparisonResult> {
1658 let latency_improvement_percent = (result2.performance.latency.mean_ms
1659 - result1.performance.latency.mean_ms)
1660 / result1.performance.latency.mean_ms
1661 * 100.0;
1662
1663 let quality_difference =
1664 result1.quality.mean_average_precision - result2.quality.mean_average_precision;
1665
1666 Ok(ComparisonResult {
1667 latency_improvement_percent,
1668 quality_difference,
1669 })
1670 }
1671}
1672
/// Pairwise deltas between two benchmark results, produced by
/// `compare_results` and consumed by the comparative-analysis log output.
struct ComparisonResult {
    // Relative mean-latency change of the second result vs. the first, in %.
    latency_improvement_percent: f64,
    // First result's mean average precision minus the second's.
    quality_difference: f64,
}
1678
1679impl StatisticalAnalyzer {
1680 pub fn new(confidence_level: f64, min_sample_size: usize, outlier_threshold: f64) -> Self {
1681 Self {
1682 confidence_level,
1683 min_sample_size,
1684 outlier_threshold,
1685 }
1686 }
1687
1688 pub fn analyze_metrics(&self, performance: &PerformanceMetrics) -> Result<StatisticalMetrics> {
1689 let sample_size = performance.latency.distribution.len();
1690
1691 let mut confidence_intervals = HashMap::new();
1692 let mut significance_tests = HashMap::new();
1693 let mut effect_sizes = HashMap::new();
1694
1695 if sample_size >= self.min_sample_size {
1697 let mean = performance.latency.mean_ms;
1698 let std = performance.latency.std_ms;
1699 let margin = self.compute_confidence_margin(std, sample_size);
1700
1701 confidence_intervals.insert(
1702 "mean_latency_ms".to_string(),
1703 (mean - margin, mean + margin),
1704 );
1705 }
1706
1707 significance_tests.insert(
1709 "latency_normality".to_string(),
1710 StatisticalTest {
1711 test_type: "Shapiro-Wilk".to_string(),
1712 p_value: 0.05,
1713 test_statistic: 0.95,
1714 is_significant: false,
1715 },
1716 );
1717
1718 effect_sizes.insert("latency_effect_size".to_string(), 0.5);
1720
1721 let power_analysis = PowerAnalysis {
1722 power: 0.8,
1723 effect_size: 0.5,
1724 required_sample_size: 30,
1725 };
1726
1727 Ok(StatisticalMetrics {
1728 sample_size,
1729 confidence_intervals,
1730 significance_tests,
1731 effect_sizes,
1732 power_analysis,
1733 })
1734 }
1735
1736 fn compute_confidence_margin(&self, std: f64, sample_size: usize) -> f64 {
1737 let t_value = 1.96; t_value * std / (sample_size as f64).sqrt()
1740 }
1741}
1742
1743impl PerformanceProfiler {
1744 pub fn new(memory_profiling: bool, cache_profiling: bool) -> Self {
1745 Self {
1746 enable_memory_profiling: memory_profiling,
1747 enable_cache_profiling: cache_profiling,
1748 enable_cpu_profiling: true,
1749 sample_interval: Duration::from_millis(10),
1750 }
1751 }
1752}
1753
1754impl Default for HyperparameterTuner {
1755 fn default() -> Self {
1756 Self::new()
1757 }
1758}
1759
1760impl HyperparameterTuner {
1761 pub fn new() -> Self {
1762 Self {
1763 optimization_strategy: OptimizationStrategy::RandomSearch,
1764 search_space: HashMap::new(),
1765 objective_function: ObjectiveFunction::Recall { k: 10, weight: 1.0 },
1766 max_iterations: 100,
1767 }
1768 }
1769}
1770
#[cfg(test)]
mod tests {
    use super::*;

    // Default and ANN-benchmarks-compatible configs expose the expected
    // statistical defaults and mode flag.
    #[test]
    fn test_advanced_benchmark_config() {
        let config = AdvancedBenchmarkConfig::new();
        assert_eq!(config.confidence_level, 0.95);
        assert_eq!(config.min_runs, 10);

        let ann_config = AdvancedBenchmarkConfig::ann_benchmarks_compatible();
        assert!(ann_config.ann_benchmarks_mode);
    }

    // Basic dataset statistics (count, dimensionality, magnitude) on a tiny
    // orthonormal basis set.
    #[test]
    fn test_dataset_analysis() {
        let config = AdvancedBenchmarkConfig::new();
        let suite = AdvancedBenchmarkSuite::new(config);

        let vectors = vec![
            Vector::new(vec![1.0, 0.0, 0.0]),
            Vector::new(vec![0.0, 1.0, 0.0]),
            Vector::new(vec![0.0, 0.0, 1.0]),
        ];

        let stats = suite.compute_dataset_statistics(&vectors).unwrap();
        assert_eq!(stats.vector_count, 3);
        assert_eq!(stats.dimensions, 3);
        assert!(stats.mean_magnitude > 0.0);
    }

    // analyze_metrics should report the sample size and emit a confidence
    // interval once the sample meets the analyzer's minimum (here 11 >= 10).
    #[test]
    fn test_statistical_analyzer() {
        let analyzer = StatisticalAnalyzer::new(0.95, 10, 2.0);

        let latency = LatencyMetrics {
            mean_ms: 1.0,
            std_ms: 0.1,
            percentiles: HashMap::new(),
            distribution: vec![
                0.9, 1.0, 1.1, 0.95, 1.05, 0.98, 1.02, 0.92, 1.08, 0.97, 1.03,
            ],
            max_ms: 1.1,
            min_ms: 0.9,
        };

        let performance = PerformanceMetrics {
            latency,
            throughput: ThroughputMetrics {
                qps: 1000.0,
                batch_qps: HashMap::new(),
                concurrent_qps: HashMap::new(),
                saturation_qps: 1200.0,
            },
            build_time: BuildTimeMetrics {
                total_seconds: 10.0,
                per_vector_ms: 0.1,
                allocation_seconds: 1.0,
                construction_seconds: 8.0,
                optimization_seconds: 1.0,
            },
            index_size: IndexSizeMetrics {
                total_bytes: 1024,
                per_vector_bytes: 100.0,
                overhead_ratio: 0.2,
                compression_ratio: 0.8,
                serialized_bytes: 800,
            },
        };

        let stats = analyzer.analyze_metrics(&performance).unwrap();
        assert_eq!(stats.sample_size, 11);
        assert!(stats.confidence_intervals.contains_key("mean_latency_ms"));
    }
}