lawkit_core/laws/zipf/
result.rs1use crate::{
2 common::risk::RiskLevel,
3 error::{BenfError, Result},
4};
5
6#[derive(Debug, Clone)]
8pub struct ZipfResult {
9 pub dataset_name: String,
10 pub numbers_analyzed: usize,
11 pub risk_level: RiskLevel,
12
13 pub zipf_exponent: f64, pub correlation_coefficient: f64, pub distribution_quality: f64, pub total_observations: usize, pub unique_items: usize, pub top_item_frequency: f64, pub rank_frequency_pairs: Vec<(usize, f64)>, pub concentration_index: f64, pub diversity_index: f64, pub power_law_fit: f64, }
29
30impl ZipfResult {
31 pub fn new(dataset_name: String, frequencies: &[f64]) -> Result<Self> {
32 if frequencies.len() < 5 {
33 return Err(BenfError::InsufficientData(frequencies.len()));
34 }
35
36 let numbers_analyzed = frequencies.len();
38 let total_observations: usize = frequencies.iter().sum::<f64>() as usize;
39 let unique_items = frequencies.len();
40 let top_item_frequency = frequencies.first().copied().unwrap_or(0.0);
41
42 let rank_frequency_pairs: Vec<(usize, f64)> = frequencies
44 .iter()
45 .enumerate()
46 .take(20.min(frequencies.len()))
47 .map(|(rank, &freq)| (rank + 1, freq))
48 .collect();
49
50 let zipf_exponent = calculate_zipf_exponent(frequencies);
52
53 let correlation_coefficient = calculate_correlation_with_theoretical_zipf(frequencies);
55
56 let distribution_quality =
58 calculate_distribution_quality(zipf_exponent, correlation_coefficient);
59
60 let concentration_index = calculate_concentration_index(frequencies);
62
63 let diversity_index = calculate_diversity_index(frequencies);
65
66 let power_law_fit = calculate_power_law_fit(frequencies);
68
69 let risk_level =
71 determine_risk_level(zipf_exponent, correlation_coefficient, distribution_quality);
72
73 Ok(ZipfResult {
74 dataset_name,
75 numbers_analyzed,
76 risk_level,
77 zipf_exponent,
78 correlation_coefficient,
79 distribution_quality,
80 total_observations,
81 unique_items,
82 top_item_frequency,
83 rank_frequency_pairs,
84 concentration_index,
85 diversity_index,
86 power_law_fit,
87 })
88 }
89}
90
/// Estimates the Zipf exponent `s` via least-squares regression of
/// ln(frequency) on ln(rank) and returns `-slope`, so an ideal Zipf
/// distribution (freq ∝ 1/rank) yields ~1.0.
///
/// Returns 0.0 when fewer than two positive frequencies are available,
/// since a slope cannot be fitted.
fn calculate_zipf_exponent(frequencies: &[f64]) -> f64 {
    if frequencies.len() < 2 {
        return 0.0;
    }

    // Collect (ln rank, ln freq) points, skipping non-positive frequencies
    // whose logarithm is undefined.
    let points: Vec<(f64, f64)> = frequencies
        .iter()
        .enumerate()
        .filter(|&(_, &freq)| freq > 0.0)
        .map(|(idx, &freq)| (((idx + 1) as f64).ln(), freq.ln()))
        .collect();

    if points.len() < 2 {
        return 0.0;
    }

    // Accumulate the standard simple-regression sums in one pass.
    let n = points.len() as f64;
    let (mut sum_x, mut sum_y, mut sum_xy, mut sum_x2) = (0.0, 0.0, 0.0, 0.0);
    for &(x, y) in &points {
        sum_x += x;
        sum_y += y;
        sum_xy += x * y;
        sum_x2 += x * x;
    }

    // Ranks are distinct, so the denominator is strictly positive here.
    let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x * sum_x);
    -slope
}
128
129fn calculate_correlation_with_theoretical_zipf(frequencies: &[f64]) -> f64 {
131 if frequencies.is_empty() {
132 return 0.0;
133 }
134
135 let total: f64 = frequencies.iter().sum();
136 if total == 0.0 {
137 return 0.0;
138 }
139
140 let theoretical: Vec<f64> = (1..=frequencies.len())
142 .map(|rank| 1.0 / rank as f64)
143 .collect();
144
145 let theoretical_sum: f64 = theoretical.iter().sum();
147 let theoretical_normalized: Vec<f64> =
148 theoretical.iter().map(|&x| x / theoretical_sum).collect();
149
150 let observed_normalized: Vec<f64> = frequencies.iter().map(|&x| x / total).collect();
151
152 calculate_pearson_correlation(&observed_normalized, &theoretical_normalized)
154}
155
/// Pearson correlation coefficient of two equally-sized samples.
///
/// Returns 0.0 for mismatched lengths, empty input, or when either
/// sample has zero variance (the coefficient is undefined there).
fn calculate_pearson_correlation(x: &[f64], y: &[f64]) -> f64 {
    if x.len() != y.len() || x.is_empty() {
        return 0.0;
    }

    let n = x.len() as f64;
    let mean_x = x.iter().sum::<f64>() / n;
    let mean_y = y.iter().sum::<f64>() / n;

    // Single pass over the paired deviations.
    let (mut cov, mut var_x, mut var_y) = (0.0, 0.0, 0.0);
    for (&xi, &yi) in x.iter().zip(y) {
        let dx = xi - mean_x;
        let dy = yi - mean_y;
        cov += dx * dy;
        var_x += dx * dx;
        var_y += dy * dy;
    }

    let denom = (var_x * var_y).sqrt();
    if denom == 0.0 {
        0.0
    } else {
        cov / denom
    }
}
185
/// Blends exponent closeness-to-1 (weight 0.6) with correlation strength
/// (weight 0.4) into a single quality score clamped to [0, 1].
fn calculate_distribution_quality(zipf_exponent: f64, correlation: f64) -> f64 {
    // Distance of the exponent from the ideal value 1, capped at 1.
    let exponent_score = 1.0 - (zipf_exponent - 1.0).abs().min(1.0);
    let correlation_score = correlation.abs();
    (0.6 * exponent_score + 0.4 * correlation_score).clamp(0.0, 1.0)
}
195
/// Gini coefficient of the frequencies: 0 for a perfectly even
/// distribution, approaching 1 as mass concentrates in a single item.
fn calculate_concentration_index(frequencies: &[f64]) -> f64 {
    if frequencies.len() < 2 {
        return 0.0;
    }

    let mut ascending = frequencies.to_vec();
    ascending.sort_by(|a, b| a.partial_cmp(b).unwrap());

    let n = ascending.len() as f64;
    let total: f64 = ascending.iter().sum();
    if total == 0.0 {
        return 0.0;
    }

    // Standard Gini formula on ascending values with 1-based index i:
    // G = Σ f_i · (2i − n − 1) / (n · Σ f).
    let weighted: f64 = ascending
        .iter()
        .enumerate()
        .map(|(i, &freq)| freq * (2.0 * (i + 1) as f64 - n - 1.0))
        .sum();

    weighted / (n * total)
}
219
/// Shannon entropy (natural log) of the normalized frequencies; higher
/// values mean a more diverse, less concentrated distribution.
fn calculate_diversity_index(frequencies: &[f64]) -> f64 {
    let total: f64 = frequencies.iter().sum();
    if total == 0.0 {
        return 0.0;
    }

    frequencies
        .iter()
        .filter(|&&freq| freq > 0.0) // ln(0) is undefined; zero terms contribute nothing
        .map(|&freq| {
            let p = freq / total;
            -(p * p.ln())
        })
        .sum()
}
237
238fn calculate_power_law_fit(frequencies: &[f64]) -> f64 {
240 if frequencies.len() < 3 {
241 return 0.0;
242 }
243
244 let theoretical_zipf = calculate_zipf_exponent(frequencies);
246 if theoretical_zipf == 0.0 {
247 return 0.0;
248 }
249
250 let total: f64 = frequencies.iter().sum();
252 let mut max_diff: f64 = 0.0;
253 let mut cumulative_observed = 0.0;
254 let mut cumulative_theoretical = 0.0;
255
256 for (i, &freq) in frequencies.iter().enumerate() {
257 cumulative_observed += freq / total;
258
259 let theoretical_freq = 1.0 / ((i + 1) as f64).powf(theoretical_zipf);
260 cumulative_theoretical += theoretical_freq;
261
262 let diff = (cumulative_observed - cumulative_theoretical).abs();
263 max_diff = max_diff.max(diff);
264 }
265
266 (1.0 - max_diff).max(0.0)
268}
269
270fn determine_risk_level(zipf_exponent: f64, correlation: f64, quality: f64) -> RiskLevel {
272 if quality >= 0.8 && (zipf_exponent - 1.0).abs() < 0.2 && correlation > 0.8 {
274 RiskLevel::Low } else if quality >= 0.6 && (zipf_exponent - 1.0).abs() < 0.4 && correlation > 0.6 {
276 RiskLevel::Medium } else if quality >= 0.4 && (zipf_exponent - 1.0).abs() < 0.7 {
278 RiskLevel::High } else {
280 RiskLevel::Critical }
282}
283
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_perfect_zipf_distribution() {
        // freq(rank) = 1000 / rank is an exact Zipf law with exponent 1.
        let freqs: Vec<f64> = (1..=20).map(|r| 1000.0 / r as f64).collect();

        let result = ZipfResult::new("test".to_string(), &freqs).unwrap();

        assert_eq!(result.numbers_analyzed, 20);
        assert!((result.zipf_exponent - 1.0).abs() < 0.1);
        assert!(result.correlation_coefficient > 0.9);
        assert!(matches!(result.risk_level, RiskLevel::Low));
    }

    #[test]
    fn test_uniform_distribution() {
        // A flat distribution is maximally non-Zipfian (exponent ~0).
        let freqs = vec![100.0; 20];

        let result = ZipfResult::new("uniform".to_string(), &freqs).unwrap();

        assert_eq!(result.numbers_analyzed, 20);
        assert!(result.zipf_exponent < 0.5 || result.zipf_exponent > 2.0);
        assert!(matches!(result.risk_level, RiskLevel::Critical));
    }

    #[test]
    fn test_insufficient_data() {
        // Fewer than 5 entries must be rejected with an error.
        let freqs = vec![1.0, 2.0];
        assert!(ZipfResult::new("test".to_string(), &freqs).is_err());
    }
}