Skip to main content

lawkit_core/laws/zipf/
result.rs

1use crate::{
2    common::risk::RiskLevel,
3    error::{BenfError, Result},
4};
5
/// Result of a Zipf's-law analysis over a ranked frequency list.
#[derive(Debug, Clone)]
pub struct ZipfResult {
    pub dataset_name: String,
    pub numbers_analyzed: usize,
    pub risk_level: RiskLevel,

    // Zipf analysis metrics
    pub zipf_exponent: f64,           // Zipf exponent (theoretical ideal is 1.0)
    pub correlation_coefficient: f64, // correlation with the theoretical Zipf distribution (goodness of fit)
    pub distribution_quality: f64,    // distribution quality score, clamped to [0, 1]

    // Frequency analysis
    pub total_observations: usize,               // total observation count (sum of frequencies, cast to usize)
    pub unique_items: usize,                     // number of unique items (one per frequency entry)
    pub top_item_frequency: f64,                 // frequency value of the first (rank-1) item
    pub rank_frequency_pairs: Vec<(usize, f64)>, // (rank, frequency) pairs for the top 20 items

    // Distribution characteristics
    pub concentration_index: f64, // concentration index (Gini coefficient)
    pub diversity_index: f64,     // diversity index (Shannon entropy, natural log)
    pub power_law_fit: f64,       // power-law goodness of fit
}
29
30impl ZipfResult {
31    pub fn new(dataset_name: String, frequencies: &[f64]) -> Result<Self> {
32        if frequencies.len() < 5 {
33            return Err(BenfError::InsufficientData(frequencies.len()));
34        }
35
36        // データの基本統計
37        let numbers_analyzed = frequencies.len();
38        let total_observations: usize = frequencies.iter().sum::<f64>() as usize;
39        let unique_items = frequencies.len();
40        let top_item_frequency = frequencies.first().copied().unwrap_or(0.0);
41
42        // ランク-頻度ペアを生成(上位20項目または全データ)
43        let rank_frequency_pairs: Vec<(usize, f64)> = frequencies
44            .iter()
45            .enumerate()
46            .take(20.min(frequencies.len()))
47            .map(|(rank, &freq)| (rank + 1, freq))
48            .collect();
49
50        // Zipf指数計算(対数線形回帰)
51        let zipf_exponent = calculate_zipf_exponent(frequencies);
52
53        // 相関係数計算(理論Zipf分布との適合度)
54        let correlation_coefficient = calculate_correlation_with_theoretical_zipf(frequencies);
55
56        // 分布品質スコア(Zipf法則への適合度)
57        let distribution_quality =
58            calculate_distribution_quality(zipf_exponent, correlation_coefficient);
59
60        // 集中度指数(Gini係数)
61        let concentration_index = calculate_concentration_index(frequencies);
62
63        // 多様性指数(Shannon entropy)
64        let diversity_index = calculate_diversity_index(frequencies);
65
66        // べき乗法則適合度
67        let power_law_fit = calculate_power_law_fit(frequencies);
68
69        // リスクレベル判定
70        let risk_level =
71            determine_risk_level(zipf_exponent, correlation_coefficient, distribution_quality);
72
73        Ok(ZipfResult {
74            dataset_name,
75            numbers_analyzed,
76            risk_level,
77            zipf_exponent,
78            correlation_coefficient,
79            distribution_quality,
80            total_observations,
81            unique_items,
82            top_item_frequency,
83            rank_frequency_pairs,
84            concentration_index,
85            diversity_index,
86            power_law_fit,
87        })
88    }
89}
90
/// Estimates the Zipf exponent by least-squares regression of
/// log(frequency) against log(rank) and returns the negated slope
/// (a Zipfian decay has a negative log-log slope).
///
/// Entries with non-positive frequency are skipped (their logarithm is
/// undefined); fewer than two usable points yields 0.0.
fn calculate_zipf_exponent(frequencies: &[f64]) -> f64 {
    if frequencies.len() < 2 {
        return 0.0;
    }

    // Collect (ln rank, ln frequency) points for the positive frequencies.
    // Ranks stay 1-based and keep their original positions.
    let points: Vec<(f64, f64)> = frequencies
        .iter()
        .enumerate()
        .filter(|&(_, &freq)| freq > 0.0)
        .map(|(idx, &freq)| (((idx + 1) as f64).ln(), freq.ln()))
        .collect();

    if points.len() < 2 {
        return 0.0;
    }

    // Ordinary least-squares slope, accumulated in a single pass.
    let n = points.len() as f64;
    let (mut sum_x, mut sum_y, mut sum_xy, mut sum_xx) = (0.0, 0.0, 0.0, 0.0);
    for &(x, y) in &points {
        sum_x += x;
        sum_y += y;
        sum_xy += x * y;
        sum_xx += x * x;
    }
    let slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x);

    // Report the magnitude of the decay.
    -slope
}
128
129/// 理論Zipf分布との相関係数を計算
130fn calculate_correlation_with_theoretical_zipf(frequencies: &[f64]) -> f64 {
131    if frequencies.is_empty() {
132        return 0.0;
133    }
134
135    let total: f64 = frequencies.iter().sum();
136    if total == 0.0 {
137        return 0.0;
138    }
139
140    // 理論Zipf分布を生成(1/rank)
141    let theoretical: Vec<f64> = (1..=frequencies.len())
142        .map(|rank| 1.0 / rank as f64)
143        .collect();
144
145    // 正規化
146    let theoretical_sum: f64 = theoretical.iter().sum();
147    let theoretical_normalized: Vec<f64> =
148        theoretical.iter().map(|&x| x / theoretical_sum).collect();
149
150    let observed_normalized: Vec<f64> = frequencies.iter().map(|&x| x / total).collect();
151
152    // ピアソン相関係数を計算
153    calculate_pearson_correlation(&observed_normalized, &theoretical_normalized)
154}
155
/// Pearson correlation coefficient of two equally-sized samples.
/// Returns 0.0 on length mismatch, empty input, or zero variance in
/// either sample (where the coefficient is undefined).
fn calculate_pearson_correlation(x: &[f64], y: &[f64]) -> f64 {
    if x.is_empty() || x.len() != y.len() {
        return 0.0;
    }

    let n = x.len() as f64;
    let mean_x = x.iter().sum::<f64>() / n;
    let mean_y = y.iter().sum::<f64>() / n;

    // Accumulate covariance and both variances in one pass.
    let (cov, var_x, var_y) =
        x.iter()
            .zip(y)
            .fold((0.0, 0.0, 0.0), |(cov, vx, vy), (&xi, &yi)| {
                let dx = xi - mean_x;
                let dy = yi - mean_y;
                (cov + dx * dy, vx + dx * dx, vy + dy * dy)
            });

    let norm = (var_x * var_y).sqrt();
    if norm == 0.0 {
        0.0
    } else {
        cov / norm
    }
}
185
/// Combines the Zipf exponent and the correlation into a single quality
/// score in [0, 1]. The exponent component peaks when the exponent equals
/// the theoretical ideal of 1.0; the correlation contributes its magnitude.
fn calculate_distribution_quality(zipf_exponent: f64, correlation: f64) -> f64 {
    // Deviation from the ideal exponent, capped at 1 so the score stays in range.
    let exponent_deviation = (zipf_exponent - 1.0).abs().min(1.0);
    let exponent_score = 1.0 - exponent_deviation;

    // Weighted blend: exponent fit 60%, correlation magnitude 40%.
    let blended = 0.6 * exponent_score + 0.4 * correlation.abs();
    blended.clamp(0.0, 1.0)
}
195
/// Gini coefficient of the frequency list: 0 for a perfectly even
/// distribution, approaching 1 as the mass concentrates in few items.
/// Returns 0.0 for fewer than two items or an all-zero total.
fn calculate_concentration_index(frequencies: &[f64]) -> f64 {
    if frequencies.len() < 2 {
        return 0.0;
    }

    // Gini formula over values sorted ascending (i is the 1-based rank):
    //   G = Σ f_i * (2i - n - 1) / (n * Σ f_i)
    let mut ascending = frequencies.to_vec();
    ascending.sort_by(|a, b| a.partial_cmp(b).unwrap());

    let n = ascending.len() as f64;
    let total: f64 = ascending.iter().sum();
    if total == 0.0 {
        return 0.0;
    }

    let weighted: f64 = ascending
        .iter()
        .enumerate()
        .map(|(i, &freq)| freq * (2.0 * (i as f64 + 1.0) - n - 1.0))
        .sum();

    weighted / (n * total)
}
219
/// Shannon entropy (natural log) of the normalized frequency
/// distribution. Zero-frequency entries contribute nothing; an empty or
/// all-zero input yields 0.0.
fn calculate_diversity_index(frequencies: &[f64]) -> f64 {
    let total: f64 = frequencies.iter().sum();
    if total == 0.0 {
        return 0.0;
    }

    frequencies
        .iter()
        .filter(|&&freq| freq > 0.0)
        .map(|&freq| {
            let p = freq / total;
            -p * p.ln()
        })
        .sum()
}
237
238/// べき乗法則適合度を計算
239fn calculate_power_law_fit(frequencies: &[f64]) -> f64 {
240    if frequencies.len() < 3 {
241        return 0.0;
242    }
243
244    // Kolmogorov-Smirnov統計量の簡易版
245    let theoretical_zipf = calculate_zipf_exponent(frequencies);
246    if theoretical_zipf == 0.0 {
247        return 0.0;
248    }
249
250    // 理論分布と観測分布の最大差を計算
251    let total: f64 = frequencies.iter().sum();
252    let mut max_diff: f64 = 0.0;
253    let mut cumulative_observed = 0.0;
254    let mut cumulative_theoretical = 0.0;
255
256    for (i, &freq) in frequencies.iter().enumerate() {
257        cumulative_observed += freq / total;
258
259        let theoretical_freq = 1.0 / ((i + 1) as f64).powf(theoretical_zipf);
260        cumulative_theoretical += theoretical_freq;
261
262        let diff = (cumulative_observed - cumulative_theoretical).abs();
263        max_diff = max_diff.max(diff);
264    }
265
266    // 適合度スコア(1.0 - KS統計量)
267    (1.0 - max_diff).max(0.0)
268}
269
270/// リスクレベルを判定
271fn determine_risk_level(zipf_exponent: f64, correlation: f64, quality: f64) -> RiskLevel {
272    // Zipf法則への適合度に基づいてリスクレベルを判定
273    if quality >= 0.8 && (zipf_exponent - 1.0).abs() < 0.2 && correlation > 0.8 {
274        RiskLevel::Low // 理想的なZipf分布
275    } else if quality >= 0.6 && (zipf_exponent - 1.0).abs() < 0.4 && correlation > 0.6 {
276        RiskLevel::Medium // 軽微な偏差
277    } else if quality >= 0.4 && (zipf_exponent - 1.0).abs() < 0.7 {
278        RiskLevel::High // 有意な偏差
279    } else {
280        RiskLevel::Critical // 重大な偏差
281    }
282}
283
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_perfect_zipf_distribution() {
        // An ideal Zipf distribution: frequency proportional to 1/rank.
        let frequencies: Vec<f64> = (1..=20).map(|rank| 1000.0 / rank as f64).collect();

        let result = ZipfResult::new("test".to_string(), &frequencies).unwrap();

        assert_eq!(result.numbers_analyzed, 20);
        assert!((result.zipf_exponent - 1.0).abs() < 0.1); // close to the theoretical 1.0
        assert!(result.correlation_coefficient > 0.9); // strong correlation
        assert!(matches!(result.risk_level, RiskLevel::Low));
    }

    #[test]
    fn test_uniform_distribution() {
        // A uniform distribution, which does not follow Zipf's law.
        let frequencies = vec![100.0; 20];
        let result = ZipfResult::new("uniform".to_string(), &frequencies).unwrap();

        assert_eq!(result.numbers_analyzed, 20);
        assert!(result.zipf_exponent < 0.5 || result.zipf_exponent > 2.0); // far from the theoretical value
        assert!(matches!(result.risk_level, RiskLevel::Critical));
    }

    #[test]
    fn test_insufficient_data() {
        let frequencies = vec![1.0, 2.0]; // fewer than the required 5 items
        let result = ZipfResult::new("test".to_string(), &frequencies);

        assert!(result.is_err());
    }
}