use std::collections::HashMap;
/// Borrowed query tuple: an `i64`, a byte slice, and an optional second byte slice.
///
/// Both slices share the lifetime `'a`, so a record never outlives the buffers it points into.
/// NOTE(review): presumably `(query id, sequence, optional paired sequence)` — confirm against
/// the code that builds and consumes these records.
pub type QueryRecord<'a> = (i64, &'a [u8], Option<&'a [u8]>);
/// Metadata describing an index: a few scalar parameters plus per-bucket maps keyed by `u32`.
///
/// NOTE(review): the `u32` map keys are presumably bucket ids (compare `HitResult::bucket_id`)
/// — confirm against the code that populates this struct.
#[derive(Debug, Clone)]
pub struct IndexMetadata {
/// NOTE(review): presumably the k-mer length used to build the index — confirm.
pub k: usize,
/// NOTE(review): presumably the minimizer window size — confirm.
pub w: usize,
/// NOTE(review): presumably a hashing salt/seed — confirm.
pub salt: u64,
/// Name associated with each `u32` key.
pub bucket_names: HashMap<u32, String>,
/// List of source strings associated with each `u32` key.
/// NOTE(review): presumably input file paths or labels — confirm.
pub bucket_sources: HashMap<u32, Vec<String>>,
/// A `usize` count associated with each `u32` key (minimizers per bucket, going by the name).
pub bucket_minimizer_counts: HashMap<u32, usize>,
/// NOTE(review): presumably the entry count of the largest shard — confirm.
pub largest_shard_entries: u64,
/// Optional `BucketFileStats` per `u32` key.
/// NOTE(review): the conditions under which this is `None` are not visible here — confirm.
pub bucket_file_stats: Option<HashMap<u32, BucketFileStats>>,
}
/// Summary statistics over a set of file lengths.
///
/// Every value is stored as `f64`, including `min` and `max`, even though the inputs are `u64`.
/// The `u64 -> f64` conversion is a plain `as` cast, so lengths above 2^53 may be rounded.
#[derive(Debug, Clone, PartialEq)]
pub struct BucketFileStats {
    /// Arithmetic mean of the lengths.
    pub mean: f64,
    /// Middle length; for an even count, the average of the two middle lengths.
    pub median: f64,
    /// Population standard deviation (the variance is divided by `n`, not `n - 1`).
    pub stdev: f64,
    /// Smallest length.
    pub min: f64,
    /// Largest length.
    pub max: f64,
}

impl BucketFileStats {
    /// Computes mean, median, population standard deviation, minimum and maximum of `lengths`.
    ///
    /// Returns `None` when `lengths` is empty, because none of the statistics are defined then.
    /// The input slice is left untouched; ordering is done on a private copy.
    pub fn from_file_lengths(lengths: &[u64]) -> Option<Self> {
        let mut ordered = lengths.to_vec();
        ordered.sort_unstable();

        // The extremes sit at the ends of the sorted copy; an empty input has neither.
        let (min, max) = match (ordered.first(), ordered.last()) {
            (Some(&lowest), Some(&highest)) => (lowest as f64, highest as f64),
            _ => return None,
        };

        let count = lengths.len() as f64;

        // Accumulate in the caller's element order so the floating-point rounding is stable.
        let total = lengths.iter().fold(0.0_f64, |acc, &len| acc + len as f64);
        let mean = total / count;

        let squared_deviation_sum = lengths.iter().fold(0.0_f64, |acc, &len| {
            let delta = len as f64 - mean;
            acc + delta * delta
        });
        let stdev = (squared_deviation_sum / count).sqrt();

        let middle = ordered.len() / 2;
        let median = match ordered.len() % 2 {
            // Even count: average the two central values (`middle >= 1` because len >= 2).
            0 => (ordered[middle - 1] as f64 + ordered[middle] as f64) / 2.0,
            _ => ordered[middle] as f64,
        };

        Some(Self {
            mean,
            median,
            stdev,
            min,
            max,
        })
    }
}
/// One scored result pairing a query id with a bucket id.
#[derive(Debug, Clone, PartialEq)]
pub struct HitResult {
/// Identifier of the query (same `i64` type as the first element of `QueryRecord`).
pub query_id: i64,
/// Identifier of the bucket.
pub bucket_id: u32,
/// Score for this query/bucket pair.
/// NOTE(review): the value range and how it is computed are not visible here — confirm.
pub score: f64,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the statistics for an input that is expected to be non-empty.
    fn stats_for(lengths: &[u64]) -> BucketFileStats {
        BucketFileStats::from_file_lengths(lengths).unwrap()
    }

    /// Asserts that `actual` lies strictly within `tolerance` of `expected`.
    fn assert_close(actual: f64, expected: f64, tolerance: f64) {
        assert!((actual - expected).abs() < tolerance);
    }

    /// A single length is its own mean, median, min and max, with zero spread.
    #[test]
    fn test_from_file_lengths_single() {
        assert_eq!(
            stats_for(&[1000]),
            BucketFileStats {
                mean: 1000.0,
                median: 1000.0,
                stdev: 0.0,
                min: 1000.0,
                max: 1000.0,
            }
        );
    }

    /// Odd count: the median is the middle element.
    #[test]
    fn test_from_file_lengths_multiple_odd() {
        let stats = stats_for(&[100, 200, 600]);
        assert_close(stats.mean, 300.0, 1e-9);
        assert_close(stats.median, 200.0, 1e-9);
        // Population variance of the three values, then its square root.
        assert_close(stats.stdev, (46666.666666666666_f64).sqrt(), 1e-6);
        assert_eq!(stats.min, 100.0);
        assert_eq!(stats.max, 600.0);
    }

    /// Even count: the median is the average of the two middle elements.
    #[test]
    fn test_from_file_lengths_multiple_even() {
        let stats = stats_for(&[10, 20, 30, 40]);
        assert_close(stats.mean, 25.0, 1e-9);
        assert_close(stats.median, 25.0, 1e-9);
        assert_close(stats.stdev, 11.180339887, 1e-6);
        assert_eq!(stats.min, 10.0);
        assert_eq!(stats.max, 40.0);
    }

    /// No lengths means no statistics.
    #[test]
    fn test_from_file_lengths_empty() {
        assert!(BucketFileStats::from_file_lengths(&[]).is_none());
    }

    /// Identical lengths collapse every statistic to that value, with zero spread.
    #[test]
    fn test_from_file_lengths_identical() {
        assert_eq!(
            stats_for(&[500, 500, 500]),
            BucketFileStats {
                mean: 500.0,
                median: 500.0,
                stdev: 0.0,
                min: 500.0,
                max: 500.0,
            }
        );
    }
}