use crate::{
common::risk::RiskLevel,
error::{BenfError, Result},
};
#[derive(Debug, Clone)]
pub struct ZipfResult {
pub dataset_name: String,
pub numbers_analyzed: usize,
pub risk_level: RiskLevel,
pub zipf_exponent: f64, pub correlation_coefficient: f64, pub distribution_quality: f64,
pub total_observations: usize, pub unique_items: usize, pub top_item_frequency: f64, pub rank_frequency_pairs: Vec<(usize, f64)>,
pub concentration_index: f64, pub diversity_index: f64, pub power_law_fit: f64, }
impl ZipfResult {
pub fn new(dataset_name: String, frequencies: &[f64]) -> Result<Self> {
if frequencies.len() < 5 {
return Err(BenfError::InsufficientData(frequencies.len()));
}
let numbers_analyzed = frequencies.len();
let total_observations: usize = frequencies.iter().sum::<f64>() as usize;
let unique_items = frequencies.len();
let top_item_frequency = frequencies.first().copied().unwrap_or(0.0);
let rank_frequency_pairs: Vec<(usize, f64)> = frequencies
.iter()
.enumerate()
.take(20.min(frequencies.len()))
.map(|(rank, &freq)| (rank + 1, freq))
.collect();
let zipf_exponent = calculate_zipf_exponent(frequencies);
let correlation_coefficient = calculate_correlation_with_theoretical_zipf(frequencies);
let distribution_quality =
calculate_distribution_quality(zipf_exponent, correlation_coefficient);
let concentration_index = calculate_concentration_index(frequencies);
let diversity_index = calculate_diversity_index(frequencies);
let power_law_fit = calculate_power_law_fit(frequencies);
let risk_level =
determine_risk_level(zipf_exponent, correlation_coefficient, distribution_quality);
Ok(ZipfResult {
dataset_name,
numbers_analyzed,
risk_level,
zipf_exponent,
correlation_coefficient,
distribution_quality,
total_observations,
unique_items,
top_item_frequency,
rank_frequency_pairs,
concentration_index,
diversity_index,
power_law_fit,
})
}
}
fn calculate_zipf_exponent(frequencies: &[f64]) -> f64 {
if frequencies.len() < 2 {
return 0.0;
}
let mut log_ranks = Vec::new();
let mut log_freqs = Vec::new();
for (i, &freq) in frequencies.iter().enumerate() {
if freq > 0.0 {
log_ranks.push(((i + 1) as f64).ln());
log_freqs.push(freq.ln());
}
}
if log_ranks.len() < 2 {
return 0.0;
}
let n = log_ranks.len() as f64;
let sum_x: f64 = log_ranks.iter().sum();
let sum_y: f64 = log_freqs.iter().sum();
let sum_xy: f64 = log_ranks
.iter()
.zip(log_freqs.iter())
.map(|(x, y)| x * y)
.sum();
let sum_x2: f64 = log_ranks.iter().map(|x| x * x).sum();
let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x * sum_x);
-slope
}
fn calculate_correlation_with_theoretical_zipf(frequencies: &[f64]) -> f64 {
if frequencies.is_empty() {
return 0.0;
}
let total: f64 = frequencies.iter().sum();
if total == 0.0 {
return 0.0;
}
let theoretical: Vec<f64> = (1..=frequencies.len())
.map(|rank| 1.0 / rank as f64)
.collect();
let theoretical_sum: f64 = theoretical.iter().sum();
let theoretical_normalized: Vec<f64> =
theoretical.iter().map(|&x| x / theoretical_sum).collect();
let observed_normalized: Vec<f64> = frequencies.iter().map(|&x| x / total).collect();
calculate_pearson_correlation(&observed_normalized, &theoretical_normalized)
}
fn calculate_pearson_correlation(x: &[f64], y: &[f64]) -> f64 {
if x.len() != y.len() || x.is_empty() {
return 0.0;
}
let n = x.len() as f64;
let mean_x: f64 = x.iter().sum::<f64>() / n;
let mean_y: f64 = y.iter().sum::<f64>() / n;
let mut sum_xy = 0.0;
let mut sum_x2 = 0.0;
let mut sum_y2 = 0.0;
for (&xi, &yi) in x.iter().zip(y.iter()) {
let dx = xi - mean_x;
let dy = yi - mean_y;
sum_xy += dx * dy;
sum_x2 += dx * dx;
sum_y2 += dy * dy;
}
let denominator = (sum_x2 * sum_y2).sqrt();
if denominator == 0.0 {
0.0
} else {
sum_xy / denominator
}
}
fn calculate_distribution_quality(zipf_exponent: f64, correlation: f64) -> f64 {
let exponent_score = 1.0 - (zipf_exponent - 1.0).abs().min(1.0);
let correlation_score = correlation.abs();
(exponent_score * 0.6 + correlation_score * 0.4).clamp(0.0, 1.0)
}
fn calculate_concentration_index(frequencies: &[f64]) -> f64 {
if frequencies.len() < 2 {
return 0.0;
}
let mut sorted_freqs = frequencies.to_vec();
sorted_freqs.sort_by(|a, b| a.partial_cmp(b).unwrap());
let n = sorted_freqs.len() as f64;
let sum: f64 = sorted_freqs.iter().sum();
if sum == 0.0 {
return 0.0;
}
let mut gini = 0.0;
for (i, &freq) in sorted_freqs.iter().enumerate() {
gini += freq * (2.0 * (i + 1) as f64 - n - 1.0);
}
gini / (n * sum)
}
fn calculate_diversity_index(frequencies: &[f64]) -> f64 {
let total: f64 = frequencies.iter().sum();
if total == 0.0 {
return 0.0;
}
let mut entropy = 0.0;
for &freq in frequencies {
if freq > 0.0 {
let p = freq / total;
entropy -= p * p.ln();
}
}
entropy
}
fn calculate_power_law_fit(frequencies: &[f64]) -> f64 {
if frequencies.len() < 3 {
return 0.0;
}
let theoretical_zipf = calculate_zipf_exponent(frequencies);
if theoretical_zipf == 0.0 {
return 0.0;
}
let total: f64 = frequencies.iter().sum();
let mut max_diff: f64 = 0.0;
let mut cumulative_observed = 0.0;
let mut cumulative_theoretical = 0.0;
for (i, &freq) in frequencies.iter().enumerate() {
cumulative_observed += freq / total;
let theoretical_freq = 1.0 / ((i + 1) as f64).powf(theoretical_zipf);
cumulative_theoretical += theoretical_freq;
let diff = (cumulative_observed - cumulative_theoretical).abs();
max_diff = max_diff.max(diff);
}
(1.0 - max_diff).max(0.0)
}
fn determine_risk_level(zipf_exponent: f64, correlation: f64, quality: f64) -> RiskLevel {
if quality >= 0.8 && (zipf_exponent - 1.0).abs() < 0.2 && correlation > 0.8 {
RiskLevel::Low } else if quality >= 0.6 && (zipf_exponent - 1.0).abs() < 0.4 && correlation > 0.6 {
RiskLevel::Medium } else if quality >= 0.4 && (zipf_exponent - 1.0).abs() < 0.7 {
RiskLevel::High } else {
RiskLevel::Critical }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_perfect_zipf_distribution() {
let frequencies: Vec<f64> = (1..=20).map(|rank| 1000.0 / rank as f64).collect();
let result = ZipfResult::new("test".to_string(), &frequencies).unwrap();
assert_eq!(result.numbers_analyzed, 20);
assert!((result.zipf_exponent - 1.0).abs() < 0.1); assert!(result.correlation_coefficient > 0.9); assert!(matches!(result.risk_level, RiskLevel::Low));
}
#[test]
fn test_uniform_distribution() {
let frequencies = vec![100.0; 20];
let result = ZipfResult::new("uniform".to_string(), &frequencies).unwrap();
assert_eq!(result.numbers_analyzed, 20);
assert!(result.zipf_exponent < 0.5 || result.zipf_exponent > 2.0); assert!(matches!(result.risk_level, RiskLevel::Critical));
}
#[test]
fn test_insufficient_data() {
let frequencies = vec![1.0, 2.0]; let result = ZipfResult::new("test".to_string(), &frequencies);
assert!(result.is_err());
}
}