use crate::distance;
/// Result of a local intrinsic dimensionality (LID) estimate for one query point.
#[derive(Debug, Clone, Copy)]
pub struct LidEstimate {
    /// Estimated LID; NaN when fewer than 2 neighbors were available,
    /// +inf when the distances were degenerate (e.g. all equal).
    pub lid: f32,
    /// Number of neighbor distances actually used for the estimate.
    pub k: usize,
    /// Distance to the k-th (farthest used) neighbor, after the epsilon clamp.
    pub max_dist: f32,
}
/// Configuration for MLE-based LID estimation.
#[derive(Debug, Clone)]
pub struct LidConfig {
    /// Maximum number of nearest-neighbor distances to use.
    pub k: usize,
    /// Relative epsilon used to clamp distances away from zero
    /// (scaled by the k-th neighbor distance before use).
    pub epsilon: f32,
}
impl Default for LidConfig {
    fn default() -> Self {
        Self {
            // 20 neighbors is a common default for MLE LID estimation.
            k: 20,
            // Tiny relative floor guarding against zero/duplicate distances.
            epsilon: 1e-10,
        }
    }
}
/// Maximum-likelihood LID estimate from distances already sorted ascending.
///
/// Uses the Levina–Bickel style estimator: LID = -m / Σ ln(d_i / d_k), where
/// the sum runs over the neighbors whose clamped ratio lies strictly in (0, 1).
/// Returns NaN (with `k` and the nearest distance) when fewer than two
/// distances are available, and +inf for degenerate (all-equal) distances.
#[must_use]
pub fn estimate_lid_mle(sorted_distances: &[f32], config: &LidConfig) -> LidEstimate {
    // Use at most `config.k` neighbors; fewer if the slice is shorter.
    let k = config.k.min(sorted_distances.len());
    if k < 2 {
        return LidEstimate {
            lid: f32::NAN,
            k,
            max_dist: sorted_distances.first().copied().unwrap_or(0.0),
        };
    }
    // Reference distance: the k-th neighbor, clamped away from zero by a
    // relative epsilon so the log-ratios below stay finite.
    let raw_dk = sorted_distances[k - 1];
    let abs_epsilon = raw_dk * config.epsilon;
    let d_k = raw_dk.max(abs_epsilon);
    // Accumulate ln(d_i / d_k) over ratios strictly inside (0, 1); ties with
    // d_k (ratio == 1) contribute nothing and are excluded from the count.
    let (sum_log, valid_count) =
        sorted_distances[..k]
            .iter()
            .fold((0.0f32, 0usize), |(acc, cnt), &d| {
                let ratio = d.max(abs_epsilon) / d_k;
                if ratio > 0.0 && ratio < 1.0 {
                    (acc + ratio.ln(), cnt + 1)
                } else {
                    (acc, cnt)
                }
            });
    // Degenerate cases (no informative ratios, or a vanishing log-sum)
    // map to +inf rather than dividing by ~zero.
    let lid = if valid_count > 0 && sum_log.abs() > abs_epsilon {
        -(valid_count as f32) / sum_log
    } else {
        f32::INFINITY
    };
    LidEstimate {
        lid,
        k,
        max_dist: d_k,
    }
}
/// Estimate LID from unsorted neighbor distances.
///
/// Copies the input, sorts it ascending with a total order (deterministic
/// even in the presence of NaN), then delegates to [`estimate_lid_mle`].
#[must_use]
pub fn estimate_lid(neighbor_distances: &[f32], config: &LidConfig) -> LidEstimate {
    let mut buf = neighbor_distances.to_vec();
    buf.sort_unstable_by(f32::total_cmp);
    estimate_lid_mle(&buf, config)
}
/// TwoNN intrinsic-dimension estimate from nearest-neighbor distance ratios
/// `mu = d2 / d1`, via a through-origin regression of `-ln(1 - F)` on `ln(mu)`.
///
/// `discard_fraction` drops the largest ratios (tail outliers) from the fit,
/// as recommended by the TwoNN method. Returns NaN when fewer than two usable
/// (finite, >= 1) ratios exist or when the fit is degenerate (all ratios ~1).
#[must_use]
pub fn estimate_twonn(mu_ratios: &[f32], discard_fraction: f32) -> f32 {
    let mut sorted: Vec<f32> = mu_ratios
        .iter()
        .filter(|&&x| x.is_finite() && x >= 1.0)
        .copied()
        .collect();
    if sorted.len() < 2 {
        return f32::NAN;
    }
    sorted.sort_unstable_by(|a, b| a.total_cmp(b));
    // The empirical CDF must be computed over the FULL filtered sample.
    // Renormalizing over the kept count after discarding the tail would make
    // -ln(1 - F) diverge at the truncation point and inflate the estimate.
    let n_total = sorted.len();
    let keep_count = ((n_total as f32) * (1.0 - discard_fraction)).max(2.0) as usize;
    let kept = &sorted[..keep_count.min(n_total)];
    if kept.len() < 2 {
        return f32::NAN;
    }
    let mut sum_xy = 0.0f64;
    let mut sum_xx = 0.0f64;
    let mut valid_count = 0usize;
    for (i, &mu) in kept.iter().enumerate() {
        let f_emp = (i + 1) as f64 / n_total as f64;
        // Skip the CDF endpoint (y would be infinite) and ratios so close to
        // 1 that their log carries no information.
        if f_emp >= 0.9999 || mu < 1.0001 {
            continue;
        }
        let x = (mu as f64).ln();
        let y = -(1.0 - f_emp).ln();
        if x.is_finite() && y.is_finite() && x > 0.0 {
            sum_xy += x * y;
            sum_xx += x * x;
            valid_count += 1;
        }
    }
    if valid_count < 2 || sum_xx.abs() < 1e-12 {
        return f32::NAN;
    }
    // Least-squares slope through the origin is the estimated dimension.
    (sum_xy / sum_xx) as f32
}
/// TwoNN dimension estimate with a normal-approximation confidence interval.
#[derive(Debug, Clone, Copy)]
pub struct TwoNNResult {
    /// Estimated intrinsic dimension (NaN when the estimate is degenerate).
    pub dimension: f32,
    /// Asymptotic standard error of the estimate, d / sqrt(n).
    pub std_error: f32,
    /// Number of ratios that actually entered the estimate.
    pub n_samples: usize,
    /// Lower bound of the 95% confidence interval (floored at 0).
    pub ci_lower: f32,
    /// Upper bound of the 95% confidence interval.
    pub ci_upper: f32,
}
/// TwoNN dimension estimate via the closed-form Pareto MLE, with an
/// asymptotic 95% confidence interval.
///
/// Filters ratios to finite values >= 1, discards the top `discard_fraction`
/// as outliers, and drops ratios indistinguishable from 1. All degenerate
/// paths return NaN fields with the relevant sample count.
#[must_use]
pub fn estimate_twonn_with_ci(mu_ratios: &[f32], discard_fraction: f32) -> TwoNNResult {
    // Single helper for every degenerate outcome.
    let nan_result = |n_samples: usize| TwoNNResult {
        dimension: f32::NAN,
        std_error: f32::NAN,
        n_samples,
        ci_lower: f32::NAN,
        ci_upper: f32::NAN,
    };
    let mut sorted: Vec<f32> = mu_ratios
        .iter()
        .copied()
        .filter(|x| x.is_finite() && *x >= 1.0)
        .collect();
    if sorted.len() < 2 {
        return nan_result(0);
    }
    sorted.sort_unstable_by(f32::total_cmp);
    // Drop the largest ratios as outliers, but always keep at least two.
    // NOTE(review): unlike the regression variant, the plain MLE below is
    // applied to the truncated sample as-is — presumably an accepted bias;
    // confirm against the intended TwoNN reference.
    let keep_count = ((sorted.len() as f32) * (1.0 - discard_fraction)).max(2.0) as usize;
    let kept = &sorted[..keep_count.min(sorted.len())];
    // Ratios at (or extremely near) 1.0 carry no log information.
    let valid_ratios: Vec<f32> = kept.iter().copied().filter(|&mu| mu > 1.0001).collect();
    let n = valid_ratios.len();
    if n < 2 {
        return nan_result(n);
    }
    let sum_log_mu: f64 = valid_ratios.iter().map(|&mu| f64::from(mu).ln()).sum();
    if sum_log_mu.abs() < 1e-12 {
        return nan_result(n);
    }
    // Pareto MLE for the dimension; asymptotic standard error is d / sqrt(n).
    let d = (n as f64) / sum_log_mu;
    let std_error = d / (n as f64).sqrt();
    TwoNNResult {
        dimension: d as f32,
        std_error: std_error as f32,
        n_samples: n,
        // 95% normal-approximation interval, lower bound floored at zero.
        ci_lower: (d - 1.96 * std_error).max(0.0) as f32,
        ci_upper: (d + 1.96 * std_error) as f32,
    }
}
/// How to combine per-point LID estimates into a single summary value.
#[derive(Debug, Clone, Copy, Default)]
pub enum LidAggregation {
    /// Arithmetic mean (default); sensitive to large outliers.
    #[default]
    Mean,
    /// Median; robust to outliers.
    Median,
    /// Harmonic mean; dominated by the smallest values.
    HarmonicMean,
}
/// Aggregate per-point LID estimates into a single value using `method`.
///
/// Non-finite and non-positive LID values are excluded first; returns NaN
/// when no valid values remain.
#[must_use]
pub fn aggregate_lid(estimates: &[LidEstimate], method: LidAggregation) -> f32 {
    // Keep only strictly positive, finite LID values.
    let mut values: Vec<f32> = estimates
        .iter()
        .filter_map(|e| (e.lid.is_finite() && e.lid > 0.0).then_some(e.lid))
        .collect();
    if values.is_empty() {
        return f32::NAN;
    }
    let count = values.len() as f32;
    match method {
        LidAggregation::Mean => values.iter().sum::<f32>() / count,
        LidAggregation::Median => {
            // Middle element, or midpoint of the middle pair, after sorting.
            values.sort_unstable_by(f32::total_cmp);
            let mid = values.len() / 2;
            if values.len() % 2 == 0 {
                (values[mid - 1] + values[mid]) / 2.0
            } else {
                values[mid]
            }
        }
        LidAggregation::HarmonicMean => {
            // Count over the sum of reciprocals.
            let reciprocal_sum: f32 = values.iter().map(|&x| x.recip()).sum();
            count / reciprocal_sum
        }
    }
}
/// Brute-force LID estimation for every vector in a flat buffer laid out as
/// `n` contiguous rows of `dimension` f32 values.
///
/// For each row, computes L2 distances to all other rows, sorts them, and
/// runs the MLE estimator. O(n^2 * dimension) — intended for small batches.
///
/// # Panics
/// Panics if `dimension` is zero.
#[must_use]
pub fn estimate_lid_batch(
    vectors: &[f32],
    dimension: usize,
    config: &LidConfig,
) -> Vec<LidEstimate> {
    assert!(dimension > 0, "dimension must be non-zero");
    // Trailing floats that do not fill a whole row are silently ignored;
    // flag that in debug builds since it almost certainly indicates a caller bug.
    debug_assert_eq!(
        vectors.len() % dimension,
        0,
        "vectors length must be a multiple of dimension"
    );
    let n = vectors.len() / dimension;
    let mut results = Vec::with_capacity(n);
    for i in 0..n {
        let query = &vectors[i * dimension..(i + 1) * dimension];
        // Distances from row i to every other row.
        let mut distances: Vec<f32> = (0..n)
            .filter(|&j| j != i)
            .map(|j| {
                let other = &vectors[j * dimension..(j + 1) * dimension];
                distance::l2_distance(query, other)
            })
            .collect();
        distances.sort_unstable_by(|a, b| a.total_cmp(b));
        results.push(estimate_lid_mle(&distances, config));
    }
    results
}
/// Coarse bucket for a LID value relative to batch statistics
/// (see `LidStats::categorize`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LidCategory {
    /// More than one standard deviation below the median.
    Low,
    /// Within one standard deviation of the median.
    Normal,
    /// More than one standard deviation above the median (or non-finite).
    High,
}
/// Summary statistics over a batch of LID estimates
/// (non-finite estimates are excluded — see `from_estimates`).
#[derive(Debug, Clone)]
pub struct LidStats {
    // Arithmetic mean of the finite LID values.
    pub mean: f32,
    // Median of the finite LID values.
    pub median: f32,
    // Population standard deviation (divides by N, not N-1).
    pub std_dev: f32,
    // Smallest finite LID value.
    pub min: f32,
    // Largest finite LID value.
    pub max: f32,
    // Number of finite LID values used.
    pub count: usize,
}
impl LidStats {
    /// Summarize a batch of estimates, ignoring non-finite LID values.
    /// Returns an all-NaN summary with count 0 when nothing is finite.
    pub fn from_estimates(estimates: &[LidEstimate]) -> Self {
        let mut finite: Vec<f32> = estimates
            .iter()
            .filter_map(|e| e.lid.is_finite().then_some(e.lid))
            .collect();
        if finite.is_empty() {
            return Self {
                mean: f32::NAN,
                median: f32::NAN,
                std_dev: f32::NAN,
                min: f32::NAN,
                max: f32::NAN,
                count: 0,
            };
        }
        let count = finite.len();
        let mean = finite.iter().sum::<f32>() / count as f32;
        // Population variance (divide by N, not N-1).
        let variance = finite.iter().map(|&x| (x - mean).powi(2)).sum::<f32>() / count as f32;
        finite.sort_unstable_by(f32::total_cmp);
        let mid = count / 2;
        let median = if count % 2 == 0 {
            (finite[mid - 1] + finite[mid]) / 2.0
        } else {
            finite[mid]
        };
        Self {
            mean,
            median,
            std_dev: variance.sqrt(),
            min: finite[0],
            max: finite[count - 1],
            count,
        }
    }

    /// Bucket a LID value relative to median ± one standard deviation.
    /// Non-finite input is treated as High.
    pub fn categorize(&self, lid: f32) -> LidCategory {
        if !lid.is_finite() {
            return LidCategory::High;
        }
        if lid < self.median - self.std_dev {
            LidCategory::Low
        } else if lid > self.median + self.std_dev {
            LidCategory::High
        } else {
            LidCategory::Normal
        }
    }

    /// Threshold above which a finite LID value is categorized as High.
    pub fn high_lid_threshold(&self) -> f32 {
        self.median + self.std_dev
    }
}
/// Convenience wrapper for HNSW callers: estimate LID from (possibly
/// unsorted) neighbor distances with `k` neighbors and the default epsilon.
// `#[must_use]` added for consistency with the other public estimators.
#[must_use]
pub fn estimate_lid_for_hnsw(neighbor_distances: &[f32], k: usize) -> LidEstimate {
    let config = LidConfig { k, epsilon: 1e-10 };
    estimate_lid(neighbor_distances, &config)
}
#[cfg(test)]
mod tests {
    use super::*;

    // Strictly increasing (linear) distances should yield a finite, positive LID.
    #[test]
    fn test_lid_uniform_distances() {
        let distances: Vec<f32> = (1..=20).map(|i| i as f32 * 0.1).collect();
        let estimate = estimate_lid_mle(&distances, &LidConfig::default());
        assert!(estimate.lid.is_finite());
        assert!(estimate.lid > 0.0);
        println!("Linear distances LID: {}", estimate.lid);
    }

    // Exponentially growing distances should still produce a finite estimate.
    #[test]
    fn test_lid_exponential_distances() {
        let distances: Vec<f32> = (1..=20).map(|i| (i as f32).exp() * 0.01).collect();
        let estimate = estimate_lid_mle(&distances, &LidConfig::default());
        assert!(estimate.lid.is_finite());
        println!("Exponential distances LID: {}", estimate.lid);
    }

    // All-equal distances are degenerate: expect infinity (or a very large value).
    #[test]
    fn test_lid_equal_distances() {
        let distances = vec![1.0f32; 20];
        let estimate = estimate_lid_mle(&distances, &LidConfig::default());
        assert!(estimate.lid.is_infinite() || estimate.lid > 100.0);
    }

    // Summary statistics and categorization over a small hand-built batch.
    #[test]
    fn test_lid_stats() {
        let estimates = vec![
            LidEstimate {
                lid: 5.0,
                k: 20,
                max_dist: 1.0,
            },
            LidEstimate {
                lid: 10.0,
                k: 20,
                max_dist: 1.5,
            },
            LidEstimate {
                lid: 8.0,
                k: 20,
                max_dist: 1.2,
            },
            LidEstimate {
                lid: 7.0,
                k: 20,
                max_dist: 1.1,
            },
            LidEstimate {
                lid: 15.0,
                k: 20,
                max_dist: 2.0,
            },
        ];
        let stats = LidStats::from_estimates(&estimates);
        assert_eq!(stats.count, 5);
        assert!(stats.mean > 0.0);
        assert_eq!(stats.min, 5.0);
        assert_eq!(stats.max, 15.0);
        assert_eq!(stats.categorize(15.0), LidCategory::High);
        assert_eq!(stats.categorize(8.0), LidCategory::Normal);
    }

    // Batch estimation over five 2-D points (one a far outlier); smoke test only.
    #[test]
    fn test_lid_batch() {
        let dim = 2;
        let vectors: Vec<f32> = vec![
            0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 10.0, 10.0,
        ];
        let config = LidConfig {
            k: 3,
            epsilon: 1e-10,
        };
        let estimates = estimate_lid_batch(&vectors, dim, &config);
        assert_eq!(estimates.len(), 5);
        let stats = LidStats::from_estimates(&estimates);
        println!(
            "LID stats: mean={}, median={}, std={}",
            stats.mean, stats.median, stats.std_dev
        );
    }

    // `config.k` controls how many neighbors are used, reflected in the result.
    #[test]
    fn test_lid_config_k() {
        let distances: Vec<f32> = (1..=50).map(|i| i as f32 * 0.1).collect();
        let config_small = LidConfig {
            k: 5,
            epsilon: 1e-10,
        };
        let config_large = LidConfig {
            k: 30,
            epsilon: 1e-10,
        };
        let est_small = estimate_lid_mle(&distances, &config_small);
        let est_large = estimate_lid_mle(&distances, &config_large);
        assert_eq!(est_small.k, 5);
        assert_eq!(est_large.k, 30);
        println!("k=5 LID: {}, k=30 LID: {}", est_small.lid, est_large.lid);
    }

    // TwoNN regression on well-behaved increasing ratios: finite, positive.
    #[test]
    fn test_twonn_basic() {
        let mu_ratios: Vec<f32> = (1..=100).map(|i| 1.0 + (i as f32 * 0.02)).collect();
        let dim = estimate_twonn(&mu_ratios, 0.1);
        println!("TwoNN estimated dimension: {}", dim);
        assert!(dim.is_finite());
        assert!(dim > 0.0);
    }

    // Empty input must produce NaN.
    #[test]
    fn test_twonn_empty() {
        let dim = estimate_twonn(&[], 0.1);
        assert!(dim.is_nan());
    }

    // A 10% block of extreme outliers should be neutralized by the discard fraction.
    #[test]
    fn test_twonn_discards_outliers() {
        let mut mu_ratios: Vec<f32> = (1..=90).map(|i| 1.0 + (i as f32 * 0.01)).collect();
        mu_ratios.resize(mu_ratios.len() + 10, 100.0);
        let dim_no_discard = estimate_twonn(&mu_ratios, 0.0);
        let dim_with_discard = estimate_twonn(&mu_ratios, 0.15);
        println!(
            "TwoNN no discard: {}, with discard: {}",
            dim_no_discard, dim_with_discard
        );
        assert!(dim_with_discard.is_finite());
    }

    // CI variant: finite estimate, positive SE, and the CI brackets the estimate.
    #[test]
    fn test_twonn_with_ci() {
        let mu_ratios: Vec<f32> = (1..=200).map(|i| 1.0 + (i as f32 * 0.015)).collect();
        let result = estimate_twonn_with_ci(&mu_ratios, 0.1);
        println!(
            "TwoNN with CI: d={:.2}, SE={:.3}, 95% CI=[{:.2}, {:.2}], n={}",
            result.dimension, result.std_error, result.ci_lower, result.ci_upper, result.n_samples
        );
        assert!(result.dimension.is_finite());
        assert!(result.std_error.is_finite());
        assert!(result.std_error > 0.0);
        assert!(result.ci_lower <= result.dimension);
        assert!(result.dimension <= result.ci_upper);
        assert!(result.n_samples > 0);
    }

    // All ratios exactly 1.0 (equidistant neighbors) carry no information: NaN.
    #[test]
    fn test_twonn_equidistant_neighbors() {
        let mu_ratios: Vec<f32> = vec![1.0; 100];
        let dim = estimate_twonn(&mu_ratios, 0.1);
        let result = estimate_twonn_with_ci(&mu_ratios, 0.1);
        assert!(dim.is_nan());
        assert!(result.dimension.is_nan());
    }

    // A mix of degenerate (1.0) and informative ratios still yields an estimate.
    #[test]
    fn test_twonn_mixed_equidistant() {
        let mut mu_ratios: Vec<f32> = vec![1.0; 50];
        mu_ratios.extend((1..=50).map(|i| 1.0 + i as f32 * 0.02));
        let dim = estimate_twonn(&mu_ratios, 0.1);
        let result = estimate_twonn_with_ci(&mu_ratios, 0.1);
        assert!(dim.is_finite());
        assert!(result.dimension.is_finite());
    }

    // With one large outlier (50.0): harmonic < median < mean ordering holds,
    // and the median stays near the bulk of the values.
    #[test]
    fn test_aggregation_methods() {
        let estimates = vec![
            LidEstimate {
                lid: 5.0,
                k: 20,
                max_dist: 1.0,
            },
            LidEstimate {
                lid: 10.0,
                k: 20,
                max_dist: 1.5,
            },
            LidEstimate {
                lid: 8.0,
                k: 20,
                max_dist: 1.2,
            },
            LidEstimate {
                lid: 7.0,
                k: 20,
                max_dist: 1.1,
            },
            LidEstimate {
                lid: 50.0,
                k: 20,
                max_dist: 5.0,
            },
        ];
        let mean = aggregate_lid(&estimates, LidAggregation::Mean);
        let median = aggregate_lid(&estimates, LidAggregation::Median);
        let harmonic = aggregate_lid(&estimates, LidAggregation::HarmonicMean);
        println!("Mean: {}, Median: {}, Harmonic: {}", mean, median, harmonic);
        assert!(mean > median);
        assert!(harmonic < mean);
        assert!((median - 8.0).abs() < 0.1);
    }

    // Aggregating nothing must produce NaN.
    #[test]
    fn test_aggregation_empty() {
        let result = aggregate_lid(&[], LidAggregation::Mean);
        assert!(result.is_nan());
    }

    // Infinite LID values are filtered out before aggregation.
    #[test]
    fn test_aggregation_handles_infinities() {
        let estimates = vec![
            LidEstimate {
                lid: 5.0,
                k: 20,
                max_dist: 1.0,
            },
            LidEstimate {
                lid: f32::INFINITY,
                k: 20,
                max_dist: 1.5,
            },
            LidEstimate {
                lid: 7.0,
                k: 20,
                max_dist: 1.2,
            },
        ];
        let mean = aggregate_lid(&estimates, LidAggregation::Mean);
        assert!(mean.is_finite());
        assert_eq!(mean, 6.0);
    }
}