//! Zipf's-law analysis (lawkit_core/laws/zipf/analysis.rs).

use super::result::ZipfResult;
use crate::error::Result;
use std::collections::HashMap;

/// Runs a Zipf-distribution analysis over a slice of raw frequency values.
///
/// This is the common entry point the other `analyze_*` helpers funnel into;
/// all statistics are computed inside [`ZipfResult::new`].
///
/// # Errors
/// Propagates any error produced by [`ZipfResult::new`] (e.g. for input it
/// cannot analyze — exact conditions are defined in `result.rs`).
pub fn analyze_zipf_distribution(frequencies: &[f64], dataset_name: &str) -> Result<ZipfResult> {
    ZipfResult::new(dataset_name.to_string(), frequencies)
}
9
10pub fn analyze_text_zipf(text: &str, dataset_name: &str) -> Result<ZipfResult> {
12 let word_frequencies = extract_word_frequencies(text);
13 let frequencies: Vec<f64> = word_frequencies
14 .into_iter()
15 .map(|(_, freq)| freq as f64)
16 .collect();
17 analyze_zipf_distribution(&frequencies, dataset_name)
18}
19
20pub fn analyze_text_zipf_from_frequencies(
22 frequencies: &[(String, usize)],
23 dataset_name: &str,
24) -> Result<ZipfResult> {
25 let freq_values: Vec<f64> = frequencies.iter().map(|(_, freq)| *freq as f64).collect();
26 analyze_zipf_distribution(&freq_values, dataset_name)
27}
28
29fn extract_word_frequencies(text: &str) -> Vec<(String, usize)> {
31 let mut word_counts = HashMap::new();
32
33 let words = tokenize_multilingual_text(text);
35
36 for word in words {
37 if !word.is_empty() && word.len() > 1 {
38 *word_counts.entry(word.to_lowercase()).or_insert(0) += 1;
39 }
40 }
41
42 let mut frequencies: Vec<(String, usize)> = word_counts.into_iter().collect();
44 frequencies.sort_by(|a, b| b.1.cmp(&a.1));
45
46 frequencies
47}
48
/// Splits text into tokens with simple multilingual rules:
/// - Hiragana, Katakana, and common CJK ideographs each become their own
///   one-character token (these scripts have no spaces between words).
/// - A fixed set of ASCII separators/punctuation ends the current token.
/// - Every other character (ASCII alphanumerics, accented letters, `-`, `_`,
///   etc.) is accumulated into the current token.
fn tokenize_multilingual_text(text: &str) -> Vec<String> {
    // Characters that terminate a token without producing one themselves.
    const SEPARATORS: &[char] = &[
        ' ', '\t', '\n', '\r', ',', '.', '!', '?', ';', ':', '"', '\'', '(', ')', '[', ']', '{',
        '}', '/', '\\', '|', '@', '#', '$', '%', '^', '&', '*', '+', '=', '<', '>', '~', '`',
    ];

    let mut tokens = Vec::new();
    let mut current = String::new();

    for ch in text.chars() {
        // Hiragana, Katakana, and the main CJK Unified Ideographs block.
        let is_cjk = matches!(
            ch,
            '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' | '\u{4E00}'..='\u{9FAF}'
        );

        if is_cjk || SEPARATORS.contains(&ch) {
            // Flush whatever token was being built.
            if !current.is_empty() {
                tokens.push(std::mem::take(&mut current));
            }
            // A CJK character is itself a token; a separator is discarded.
            if is_cjk {
                tokens.push(ch.to_string());
            }
        } else {
            current.push(ch);
        }
    }

    if !current.is_empty() {
        tokens.push(current);
    }

    tokens
}
92
93pub fn analyze_numeric_zipf(numbers: &[f64], dataset_name: &str) -> Result<ZipfResult> {
95 let mut frequencies = numbers.to_vec();
97 frequencies.sort_by(|a, b| b.partial_cmp(a).unwrap());
98
99 frequencies.retain(|&x| x > 0.0);
101
102 analyze_zipf_distribution(&frequencies, dataset_name)
103}
104
105pub fn analyze_combined_zipf(datasets: &[(&str, &[f64])]) -> Result<Vec<ZipfResult>> {
107 let mut results = Vec::new();
108
109 for (name, data) in datasets {
110 let result = analyze_zipf_distribution(data, name)?;
111 results.push(result);
112 }
113
114 Ok(results)
115}
116
117pub fn evaluate_zipf_quality(zipf_result: &ZipfResult) -> ZipfQualityReport {
119 let mut quality_metrics = Vec::new();
120
121 let exponent_score = calculate_exponent_score(zipf_result.zipf_exponent);
123 quality_metrics.push(QualityMetric {
124 name: "Exponent Quality".to_string(),
125 score: exponent_score,
126 description: format!("指数値: {:.3} (理想値: 1.0)", zipf_result.zipf_exponent),
127 });
128
129 let correlation_score = zipf_result.correlation_coefficient;
131 quality_metrics.push(QualityMetric {
132 name: "Correlation".to_string(),
133 score: correlation_score,
134 description: format!("相関係数: {correlation_score:.3}"),
135 });
136
137 let overall_score = (exponent_score + correlation_score) / 2.0;
139
140 ZipfQualityReport {
141 overall_score,
142 quality_metrics,
143 compliance_level: determine_compliance_level(overall_score),
144 }
145}
146
/// Maps the fitted Zipf exponent to a quality score in [0.0, 1.0].
///
/// The score depends only on the absolute deviation from the ideal exponent
/// of 1.0, stepping down in fixed bands; a deviation above 1.0 scores 0.0.
fn calculate_exponent_score(exponent: f64) -> f64 {
    // (max deviation, score) bands, checked in increasing order of deviation.
    const BANDS: [(f64, f64); 5] = [(0.1, 1.0), (0.3, 0.8), (0.5, 0.6), (0.7, 0.4), (1.0, 0.2)];

    let deviation = (exponent - 1.0).abs();
    BANDS
        .iter()
        .find(|(limit, _)| deviation <= *limit)
        .map(|(_, score)| *score)
        .unwrap_or(0.0)
}
167
/// Translates an overall quality score into a human-readable compliance label.
///
/// Thresholds are inclusive at the lower bound of each band; anything below
/// 0.2 (including negative scores) is "Very Poor".
fn determine_compliance_level(score: f64) -> String {
    let level = match score {
        s if s >= 0.8 => "Excellent",
        s if s >= 0.6 => "Good",
        s if s >= 0.4 => "Fair",
        s if s >= 0.2 => "Poor",
        _ => "Very Poor",
    };
    level.to_string()
}
182
/// One named component of a Zipf quality evaluation.
#[derive(Debug, Clone)]
pub struct QualityMetric {
    // Short English metric name, e.g. "Exponent Quality" or "Correlation".
    pub name: String,
    // Metric score; produced in roughly [0.0, 1.0] by `evaluate_zipf_quality`.
    pub score: f64,
    // Human-readable detail text (currently formatted in Japanese).
    pub description: String,
}
190
/// Aggregate result of `evaluate_zipf_quality`.
#[derive(Debug, Clone)]
pub struct ZipfQualityReport {
    // Mean of the individual metric scores.
    pub overall_score: f64,
    // The individual metrics the overall score was derived from.
    pub quality_metrics: Vec<QualityMetric>,
    // Label derived from `overall_score` ("Excellent" … "Very Poor").
    pub compliance_level: String,
}
198
#[cfg(test)]
mod tests {
    use super::*;

    // End-to-end: raw English text through tokenization, counting, and analysis.
    #[test]
    fn test_text_zipf_analysis() {
        let text = "the quick brown fox jumps over the lazy dog the fox is quick";
        let result = analyze_text_zipf(text, "sample_text").unwrap();

        assert!(result.numbers_analyzed > 0);
        assert!(result.unique_items > 0);
        assert!(result.total_observations > 0);
    }

    // Numeric path: a 1/rank-shaped series should fit with a positive exponent.
    #[test]
    fn test_numeric_zipf_analysis() {
        let numbers = vec![
            1000.0, 500.0, 333.0, 250.0, 200.0, 166.0, 142.0, 125.0, 111.0, 100.0,
        ];
        let result = analyze_numeric_zipf(&numbers, "numeric_test").unwrap();

        assert_eq!(result.numbers_analyzed, 10);
        assert!(result.zipf_exponent > 0.0);
    }

    // Tokenizer: ASCII words stay whole, CJK splits into one-char tokens.
    #[test]
    fn test_multilingual_tokenization() {
        let text = "Hello 世界 测试 مرحبا";
        let tokens = tokenize_multilingual_text(text);

        assert!(!tokens.is_empty());
        assert!(tokens.contains(&"Hello".to_string()));
        assert!(tokens.contains(&"世".to_string()));
        assert!(tokens.contains(&"界".to_string()));
    }

    // Quality report: overall score should be a sane value in [0, 1].
    #[test]
    fn test_zipf_quality_evaluation() {
        let frequencies = vec![100.0, 50.0, 33.0, 25.0, 20.0];
        let result = analyze_zipf_distribution(&frequencies, "test").unwrap();
        let quality_report = evaluate_zipf_quality(&result);

        assert!(quality_report.overall_score >= 0.0);
        assert!(quality_report.overall_score <= 1.0);
    }

    // Batch path: one result per input dataset, in order.
    #[test]
    fn test_combined_zipf_analysis() {
        let dataset1 = vec![100.0, 50.0, 33.0, 25.0, 20.0];
        let dataset2 = vec![200.0, 100.0, 66.0, 50.0, 40.0];
        let datasets = vec![
            ("dataset1", dataset1.as_slice()),
            ("dataset2", dataset2.as_slice()),
        ];

        let results = analyze_combined_zipf(&datasets).unwrap();
        assert_eq!(results.len(), 2);
    }
}