Skip to main content

lawkit_core/laws/zipf/
analysis.rs

1use super::result::ZipfResult;
2use crate::error::Result;
3use std::collections::HashMap;
4
5/// ジップの法則(Zipf's law)の分析を実行
6pub fn analyze_zipf_distribution(frequencies: &[f64], dataset_name: &str) -> Result<ZipfResult> {
7    ZipfResult::new(dataset_name.to_string(), frequencies)
8}
9
10/// テキストデータからZipf分析を実行
11pub fn analyze_text_zipf(text: &str, dataset_name: &str) -> Result<ZipfResult> {
12    let word_frequencies = extract_word_frequencies(text);
13    let frequencies: Vec<f64> = word_frequencies
14        .into_iter()
15        .map(|(_, freq)| freq as f64)
16        .collect();
17    analyze_zipf_distribution(&frequencies, dataset_name)
18}
19
20/// 頻度データからZipf分析を実行
21pub fn analyze_text_zipf_from_frequencies(
22    frequencies: &[(String, usize)],
23    dataset_name: &str,
24) -> Result<ZipfResult> {
25    let freq_values: Vec<f64> = frequencies.iter().map(|(_, freq)| *freq as f64).collect();
26    analyze_zipf_distribution(&freq_values, dataset_name)
27}
28
29/// テキストから単語頻度を抽出
30fn extract_word_frequencies(text: &str) -> Vec<(String, usize)> {
31    let mut word_counts = HashMap::new();
32
33    // 単語分割(日本語・英語・中国語対応)
34    let words = tokenize_multilingual_text(text);
35
36    for word in words {
37        if !word.is_empty() && word.len() > 1 {
38            *word_counts.entry(word.to_lowercase()).or_insert(0) += 1;
39        }
40    }
41
42    // 頻度順にソート
43    let mut frequencies: Vec<(String, usize)> = word_counts.into_iter().collect();
44    frequencies.sort_by(|a, b| b.1.cmp(&a.1));
45
46    frequencies
47}
48
/// Splits `text` into tokens, handling ASCII words and Japanese/Chinese text.
///
/// Rules, applied character by character:
/// - ASCII letters and digits accumulate into the current token.
/// - Hiragana, katakana and kanji (U+4E00..=U+9FAF) each become a
///   one-character token of their own, ending any token in progress.
/// - The listed ASCII punctuation, the ideographic comma and full stop
///   (U+3001, U+3002) and any Unicode whitespace end the current token and
///   are discarded.
/// - Every other character (for example `-`, `_`, accented or Arabic letters)
///   is appended to the current token.
///
/// Returns the tokens in order of appearance; never contains empty strings.
fn tokenize_multilingual_text(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current_token = String::new();

    for ch in text.chars() {
        match ch {
            // ASCII letters and digits extend the word being built.
            'a'..='z' | 'A'..='Z' | '0'..='9' => current_token.push(ch),
            // Hiragana | katakana | kanji: one token per character.
            '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' | '\u{4E00}'..='\u{9FAF}' => {
                if !current_token.is_empty() {
                    // `take` moves the buffer out and leaves an empty one behind.
                    tokens.push(std::mem::take(&mut current_token));
                }
                tokens.push(ch.to_string());
            }
            // ASCII delimiters plus the ideographic comma and full stop.
            ' ' | '\t' | '\n' | '\r' | ',' | '.' | '!' | '?' | ';' | ':' |
            '"' | '\'' | '(' | ')' | '[' | ']' | '{' | '}' | '/' | '\\' |
            '|' | '@' | '#' | '$' | '%' | '^' | '&' | '*' | '+' | '=' |
            '<' | '>' | '~' | '`' | '\u{3001}' | '\u{3002}' => {
                if !current_token.is_empty() {
                    tokens.push(std::mem::take(&mut current_token));
                }
            }
            // Remaining Unicode whitespace (ideographic space, NBSP, ...)
            // must separate words just like an ASCII space does.
            _ if ch.is_whitespace() => {
                if !current_token.is_empty() {
                    tokens.push(std::mem::take(&mut current_token));
                }
            }
            // Anything else is treated as part of a word.
            _ => current_token.push(ch),
        }
    }

    // Flush the trailing token, if any.
    if !current_token.is_empty() {
        tokens.push(current_token);
    }

    tokens
}
92
93/// 数値データからZipf分析(頻度分布として扱う)
94pub fn analyze_numeric_zipf(numbers: &[f64], dataset_name: &str) -> Result<ZipfResult> {
95    // 数値を頻度として扱い、降順にソート
96    let mut frequencies = numbers.to_vec();
97    frequencies.sort_by(|a, b| b.partial_cmp(a).unwrap());
98
99    // 負の値を除去
100    frequencies.retain(|&x| x > 0.0);
101
102    analyze_zipf_distribution(&frequencies, dataset_name)
103}
104
105/// 複数データセットの統合Zipf分析
106pub fn analyze_combined_zipf(datasets: &[(&str, &[f64])]) -> Result<Vec<ZipfResult>> {
107    let mut results = Vec::new();
108
109    for (name, data) in datasets {
110        let result = analyze_zipf_distribution(data, name)?;
111        results.push(result);
112    }
113
114    Ok(results)
115}
116
117/// Zipf分布の品質評価
118pub fn evaluate_zipf_quality(zipf_result: &ZipfResult) -> ZipfQualityReport {
119    let mut quality_metrics = Vec::new();
120
121    // 指数の理想値からの偏差
122    let exponent_score = calculate_exponent_score(zipf_result.zipf_exponent);
123    quality_metrics.push(QualityMetric {
124        name: "Exponent Quality".to_string(),
125        score: exponent_score,
126        description: format!("指数値: {:.3} (理想値: 1.0)", zipf_result.zipf_exponent),
127    });
128
129    // 相関係数の評価
130    let correlation_score = zipf_result.correlation_coefficient;
131    quality_metrics.push(QualityMetric {
132        name: "Correlation".to_string(),
133        score: correlation_score,
134        description: format!("相関係数: {correlation_score:.3}"),
135    });
136
137    // 全体品質スコア
138    let overall_score = (exponent_score + correlation_score) / 2.0;
139
140    ZipfQualityReport {
141        overall_score,
142        quality_metrics,
143        compliance_level: determine_compliance_level(overall_score),
144    }
145}
146
/// Scores a Zipf exponent by its absolute distance from the ideal value 1.0.
///
/// The score steps down from 1.0 to 0.2 as the deviation crosses 0.1, 0.3,
/// 0.5, 0.7 and 1.0 (each bound inclusive); anything further away, or a NaN
/// exponent, scores 0.0.
fn calculate_exponent_score(exponent: f64) -> f64 {
    // (inclusive upper bound on the deviation, score awarded), tightest first.
    const TIERS: [(f64, f64); 5] = [(0.1, 1.0), (0.3, 0.8), (0.5, 0.6), (0.7, 0.4), (1.0, 0.2)];

    let distance = (exponent - 1.0).abs();

    TIERS
        .iter()
        .find(|(limit, _)| distance <= *limit)
        .map_or(0.0, |&(_, score)| score)
}
167
/// Maps a quality score to a compliance label.
///
/// Thresholds are inclusive lower bounds: 0.8 "Excellent", 0.6 "Good",
/// 0.4 "Fair", 0.2 "Poor"; anything below (or NaN) is "Very Poor".
fn determine_compliance_level(score: f64) -> String {
    let label = match score {
        s if s >= 0.8 => "Excellent",
        s if s >= 0.6 => "Good",
        s if s >= 0.4 => "Fair",
        s if s >= 0.2 => "Poor",
        _ => "Very Poor",
    };
    label.to_string()
}
182
/// A single named quality measurement.
#[derive(Clone, Debug)]
pub struct QualityMetric {
    /// Short label identifying the metric.
    pub name: String,
    /// Numeric score for this metric.
    pub score: f64,
    /// Human-readable explanation of the score.
    pub description: String,
}
190
191/// Zipf品質レポート
192#[derive(Debug, Clone)]
193pub struct ZipfQualityReport {
194    pub overall_score: f64,
195    pub quality_metrics: Vec<QualityMetric>,
196    pub compliance_level: String,
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain English text produces a populated analysis result.
    #[test]
    fn test_text_zipf_analysis() {
        let sample = "the quick brown fox jumps over the lazy dog the fox is quick";
        let analysis = analyze_text_zipf(sample, "sample_text").unwrap();

        assert!(analysis.numbers_analyzed > 0);
        assert!(analysis.unique_items > 0);
        assert!(analysis.total_observations > 0);
    }

    /// Ten positive values are all analysed and give a positive exponent.
    #[test]
    fn test_numeric_zipf_analysis() {
        let values: Vec<f64> = vec![
            1000.0, 500.0, 333.0, 250.0, 200.0, 166.0, 142.0, 125.0, 111.0, 100.0,
        ];
        let analysis = analyze_numeric_zipf(&values, "numeric_test").unwrap();

        assert_eq!(analysis.numbers_analyzed, 10);
        assert!(analysis.zipf_exponent > 0.0);
    }

    /// ASCII words stay whole while kanji are split per character.
    #[test]
    fn test_multilingual_tokenization() {
        let tokens = tokenize_multilingual_text("Hello 世界 测试 مرحبا");

        assert!(!tokens.is_empty());
        for expected in ["Hello", "世", "界"] {
            assert!(tokens.iter().any(|token| token.as_str() == expected));
        }
    }

    /// The overall quality score of a near-Zipfian sample lies in 0..=1.
    #[test]
    fn test_zipf_quality_evaluation() {
        let sample = vec![100.0, 50.0, 33.0, 25.0, 20.0];
        let analysis = analyze_zipf_distribution(&sample, "test").unwrap();
        let report = evaluate_zipf_quality(&analysis);

        assert!(report.overall_score >= 0.0);
        assert!(report.overall_score <= 1.0);
    }

    /// Each input dataset yields exactly one result.
    #[test]
    fn test_combined_zipf_analysis() {
        let first = vec![100.0, 50.0, 33.0, 25.0, 20.0];
        let second = vec![200.0, 100.0, 66.0, 50.0, 40.0];
        let inputs = vec![("dataset1", first.as_slice()), ("dataset2", second.as_slice())];

        let analyses = analyze_combined_zipf(&inputs).unwrap();
        assert_eq!(analyses.len(), 2);
    }
}