lawkit_python/subcommands/
zipf.rs1use crate::colors;
2use crate::common_options::get_optimized_reader;
3use clap::ArgMatches;
4use lawkit_core::{
5 common::{
6 filtering::{apply_number_filter, NumberFilter},
7 input::{parse_input_auto, parse_text_input},
8 memory::{streaming_zipf_analysis, MemoryConfig},
9 risk::RiskLevel,
10 streaming_io::OptimizedFileReader,
11 },
12 error::{BenfError, Result},
13 laws::zipf::{
14 analyze_numeric_zipf, analyze_text_zipf, analyze_text_zipf_from_frequencies, ZipfResult,
15 },
16};
17
18pub fn run(matches: &ArgMatches) -> Result<()> {
19 let is_text_mode = matches.get_flag("text");
20
21 if matches.get_flag("verbose") {
23 eprintln!(
24 "Debug: input argument = {:?}",
25 matches.get_one::<String>("input")
26 );
27 eprintln!("Debug: text mode = {is_text_mode}");
28 }
29
30 if let Some(input) = matches.get_one::<String>("input") {
31 if is_text_mode {
33 let buffer = if input == "-" {
35 match get_optimized_reader(None) {
36 Ok(data) => data,
37 Err(e) => {
38 eprintln!("Error reading input: {e}");
39 std::process::exit(1);
40 }
41 }
42 } else {
43 match get_optimized_reader(Some(input)) {
44 Ok(data) => data,
45 Err(e) => {
46 eprintln!("Error reading input: {e}");
47 std::process::exit(1);
48 }
49 }
50 };
51
52 match analyze_text_zipf(&buffer, input) {
53 Ok(result) => {
54 output_results(matches, &result);
55 std::process::exit(result.risk_level.exit_code());
56 }
57 Err(e) => {
58 eprintln!("Analysis error: {e}");
59 std::process::exit(1);
60 }
61 }
62 } else {
63 match parse_input_auto(input) {
65 Ok(numbers) => {
66 if numbers.is_empty() {
67 eprintln!("Error: No valid numbers found in input");
68 std::process::exit(1);
69 }
70
71 let result =
72 match analyze_numbers_with_options(matches, input.to_string(), &numbers) {
73 Ok(result) => result,
74 Err(e) => {
75 eprintln!("Analysis error: {e}");
76 std::process::exit(1);
77 }
78 };
79
80 output_results(matches, &result);
81 std::process::exit(result.risk_level.exit_code());
82 }
83 Err(e) => {
84 eprintln!("Error processing input '{input}': {e}");
85 std::process::exit(1);
86 }
87 }
88 }
89 } else {
90 if matches.get_flag("verbose") {
92 eprintln!("Debug: Reading from stdin, using automatic optimization");
93 eprintln!(
94 "Debug: Using automatic optimization (streaming + incremental + memory efficiency)"
95 );
96 }
97
98 if is_text_mode {
99 let mut reader = OptimizedFileReader::from_stdin();
101 let memory_config = MemoryConfig::default();
102
103 let mut words = Vec::new();
105 match reader.read_lines_streaming(|line: String| {
106 let line_words: Vec<String> =
108 line.split_whitespace().map(|s| s.to_string()).collect();
109 words.extend(line_words);
110 Ok(None::<()>)
111 }) {
112 Ok(_) => {}
113 Err(e) => {
114 eprintln!("Error reading stream: {e}");
115 std::process::exit(1);
116 }
117 }
118
119 if matches.get_flag("verbose") {
120 eprintln!("Debug: Collected {} words from stream", words.len());
121 }
122
123 let chunk_result = match streaming_zipf_analysis(words.into_iter(), &memory_config) {
125 Ok(result) => {
126 if matches.get_flag("verbose") {
127 eprintln!(
128 "Debug: Streaming analysis successful - {} items processed",
129 result.total_items
130 );
131 }
132 result
133 }
134 Err(e) => {
135 eprintln!("Streaming analysis error: {e}");
136 std::process::exit(1);
137 }
138 };
139
140 if matches.get_flag("verbose") {
141 eprintln!(
142 "Debug: Processed {} items in {} chunks",
143 chunk_result.total_items, chunk_result.chunks_processed
144 );
145 eprintln!("Debug: Memory used: {:.2} MB", chunk_result.memory_used_mb);
146 eprintln!(
147 "Debug: Processing time: {} ms",
148 chunk_result.processing_time_ms
149 );
150 }
151
152 let frequencies = chunk_result.result.get_sorted_frequencies();
154 let result = match analyze_text_zipf_from_frequencies(&frequencies, "stdin") {
155 Ok(result) => result,
156 Err(e) => {
157 eprintln!("Analysis error: {e}");
158 std::process::exit(1);
159 }
160 };
161
162 output_results(matches, &result);
163 std::process::exit(result.risk_level.exit_code());
164 } else {
165 let buffer = match get_optimized_reader(None) {
167 Ok(data) => data,
168 Err(e) => {
169 eprintln!("Error reading input: {e}");
170 std::process::exit(1);
171 }
172 };
173 let numbers = match parse_text_input(&buffer) {
174 Ok(numbers) => {
175 if matches.get_flag("verbose") {
176 eprintln!("Debug: Collected {} numbers from input", numbers.len());
177 }
178 numbers
179 }
180 Err(e) => {
181 eprintln!("Analysis error: {e}");
182 std::process::exit(1);
183 }
184 };
185
186 if numbers.is_empty() {
187 eprintln!("Error: No valid numbers found in input");
188 std::process::exit(1);
189 }
190
191 let result = match analyze_numbers_with_options(matches, "stdin".to_string(), &numbers)
192 {
193 Ok(result) => result,
194 Err(e) => {
195 eprintln!("Analysis error: {e}");
196 std::process::exit(1);
197 }
198 };
199
200 output_results(matches, &result);
201 std::process::exit(result.risk_level.exit_code());
202 }
203 }
204}
205
206fn output_results(matches: &clap::ArgMatches, result: &ZipfResult) {
207 let format = matches.get_one::<String>("format").unwrap();
208 let quiet = matches.get_flag("quiet");
209 let verbose = matches.get_flag("verbose");
210
211 match format.as_str() {
212 "text" => print_text_output(result, quiet, verbose),
213 "json" => print_json_output(result),
214 "csv" => print_csv_output(result),
215 "yaml" => print_yaml_output(result),
216 "toml" => print_toml_output(result),
217 "xml" => print_xml_output(result),
218 _ => {
219 eprintln!("Error: Unsupported output format: {format}");
220 std::process::exit(2);
221 }
222 }
223}
224
225fn print_text_output(result: &ZipfResult, quiet: bool, verbose: bool) {
226 if quiet {
227 println!("zipf_exponent: {:.3}", result.zipf_exponent);
228 println!("correlation: {:.3}", result.correlation_coefficient);
229 println!("distribution_quality: {:.3}", result.distribution_quality);
230 return;
231 }
232
233 println!("Zipf Law Analysis Results");
234 println!();
235 println!("Dataset: {}", result.dataset_name);
236 println!("Numbers analyzed: {}", result.numbers_analyzed);
237 match result.risk_level {
238 RiskLevel::Critical => println!("{}", colors::level_critical("Dataset analysis")),
239 RiskLevel::High => println!("{}", colors::level_high("Dataset analysis")),
240 RiskLevel::Medium => println!("{}", colors::level_medium("Dataset analysis")),
241 RiskLevel::Low => println!("{}", colors::level_low("Dataset analysis")),
242 }
243
244 println!();
245 println!("Rank-Frequency Distribution:");
246 println!("{}", format_rank_frequency_chart(result));
247
248 if verbose {
249 println!();
250 println!("Zipf Metrics:");
251 println!(" Zipf exponent: {:.3}", result.zipf_exponent);
252 println!(
253 " Correlation coefficient: {:.3}",
254 result.correlation_coefficient
255 );
256 println!(" Distribution quality: {:.3}", result.distribution_quality);
257 println!(" Power law fit: {:.3}", result.power_law_fit);
258
259 println!();
260 println!("Distribution Statistics:");
261 println!(" Total observations: {}", result.total_observations);
262 println!(" Unique items: {}", result.unique_items);
263 println!(" Top item frequency: {:.1}%", result.top_item_frequency);
264 println!(" Concentration index: {:.3}", result.concentration_index);
265 println!(" Diversity index (Shannon): {:.3}", result.diversity_index);
266
267 println!();
268 println!("Interpretation:");
269 print_zipf_interpretation(result);
270 }
271}
272
273fn print_zipf_interpretation(result: &ZipfResult) {
274 use lawkit_core::common::risk::RiskLevel;
275
276 match result.risk_level {
277 RiskLevel::Low => {
278 println!("[PASS] Ideal Zipf distribution - follows Zipf's law");
279 println!(" Distribution follows the expected 1/rank pattern");
280 }
281 RiskLevel::Medium => {
282 println!("[WARN] Slight deviation from Zipf's law");
283 println!(" Monitoring recommended for distribution pattern");
284 }
285 RiskLevel::High => {
286 println!("[FAIL] Significant deviation from Zipf's law");
287 println!(" Consider rebalancing distribution");
288 }
289 RiskLevel::Critical => {
290 println!("[CRITICAL] Critical deviation from Zipf's law");
291 println!(" Distribution strategy review needed");
292 }
293 }
294
295 if result.zipf_exponent > 1.5 {
297 println!(" INFO: High concentration - extreme dominance pattern");
298 } else if result.zipf_exponent < 0.5 {
299 println!(" INFO: Low concentration - more uniform distribution");
300 }
301
302 if result.correlation_coefficient < 0.5 {
304 println!(" ALERT: Poor fit to Zipf's law - irregular distribution");
305 } else if result.correlation_coefficient > 0.8 {
306 println!(" INFO: Excellent fit to Zipf's law");
307 }
308}
309
310fn print_json_output(result: &ZipfResult) {
311 use serde_json::json;
312
313 let output = json!({
314 "dataset": result.dataset_name,
315 "numbers_analyzed": result.numbers_analyzed,
316 "risk_level": format!("{:?}", result.risk_level),
317 "zipf_exponent": result.zipf_exponent,
318 "correlation_coefficient": result.correlation_coefficient,
319 "distribution_quality": result.distribution_quality,
320 "total_observations": result.total_observations,
321 "unique_items": result.unique_items,
322 "top_item_frequency": result.top_item_frequency,
323 "concentration_index": result.concentration_index,
324 "diversity_index": result.diversity_index,
325 "power_law_fit": result.power_law_fit,
326 "rank_frequency_pairs": result.rank_frequency_pairs
327 });
328
329 println!("{}", serde_json::to_string_pretty(&output).unwrap());
330}
331
332fn print_csv_output(result: &ZipfResult) {
333 println!("dataset,numbers_analyzed,risk_level,zipf_exponent,correlation_coefficient,distribution_quality,power_law_fit");
334 println!(
335 "{},{},{:?},{:.3},{:.3},{:.3},{:.3}",
336 result.dataset_name,
337 result.numbers_analyzed,
338 result.risk_level,
339 result.zipf_exponent,
340 result.correlation_coefficient,
341 result.distribution_quality,
342 result.power_law_fit
343 );
344}
345
346fn print_yaml_output(result: &ZipfResult) {
347 println!("dataset: \"{}\"", result.dataset_name);
348 println!("numbers_analyzed: {}", result.numbers_analyzed);
349 println!("risk_level: \"{:?}\"", result.risk_level);
350 println!("zipf_exponent: {:.3}", result.zipf_exponent);
351 println!(
352 "correlation_coefficient: {:.3}",
353 result.correlation_coefficient
354 );
355 println!("distribution_quality: {:.3}", result.distribution_quality);
356 println!("power_law_fit: {:.3}", result.power_law_fit);
357}
358
359fn print_toml_output(result: &ZipfResult) {
360 println!("dataset = \"{}\"", result.dataset_name);
361 println!("numbers_analyzed = {}", result.numbers_analyzed);
362 println!("risk_level = \"{:?}\"", result.risk_level);
363 println!("zipf_exponent = {:.3}", result.zipf_exponent);
364 println!(
365 "correlation_coefficient = {:.3}",
366 result.correlation_coefficient
367 );
368 println!("distribution_quality = {:.3}", result.distribution_quality);
369 println!("power_law_fit = {:.3}", result.power_law_fit);
370}
371
372fn print_xml_output(result: &ZipfResult) {
373 println!("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
374 println!("<zipf_analysis>");
375 println!(" <dataset>{}</dataset>", result.dataset_name);
376 println!(
377 " <numbers_analyzed>{}</numbers_analyzed>",
378 result.numbers_analyzed
379 );
380 println!(" <risk_level>{:?}</risk_level>", result.risk_level);
381 println!(
382 " <zipf_exponent>{:.3}</zipf_exponent>",
383 result.zipf_exponent
384 );
385 println!(
386 " <correlation_coefficient>{:.3}</correlation_coefficient>",
387 result.correlation_coefficient
388 );
389 println!(
390 " <distribution_quality>{:.3}</distribution_quality>",
391 result.distribution_quality
392 );
393 println!(
394 " <power_law_fit>{:.3}</power_law_fit>",
395 result.power_law_fit
396 );
397 println!("</zipf_analysis>");
398}
399
400fn analyze_numbers_with_options(
402 matches: &clap::ArgMatches,
403 dataset_name: String,
404 numbers: &[f64],
405) -> Result<ZipfResult> {
406 let filtered_numbers = if let Some(filter_str) = matches.get_one::<String>("filter") {
408 let filter = NumberFilter::parse(filter_str)
409 .map_err(|e| BenfError::ParseError(format!("無効なフィルタ: {e}")))?;
410
411 let filtered = apply_number_filter(numbers, &filter);
412
413 if filtered.len() != numbers.len() {
415 eprintln!(
416 "フィルタリング結果: {} 個の数値が {} 個に絞り込まれました ({})",
417 numbers.len(),
418 filtered.len(),
419 filter.description()
420 );
421 }
422
423 filtered
424 } else {
425 numbers.to_vec()
426 };
427
428 let min_count = if let Some(min_count_str) = matches.get_one::<String>("min-count") {
430 min_count_str
431 .parse::<usize>()
432 .map_err(|_| BenfError::ParseError("無効な最小数値数".to_string()))?
433 } else {
434 5
435 };
436
437 if filtered_numbers.len() < min_count {
439 return Err(BenfError::InsufficientData(filtered_numbers.len()));
440 }
441
442 analyze_numeric_zipf(&filtered_numbers, &dataset_name)
444}
445
446fn format_rank_frequency_chart(result: &ZipfResult) -> String {
447 let mut output = String::new();
448 const CHART_WIDTH: usize = 50;
449
450 if result.rank_frequency_pairs.is_empty() {
451 return "No data available for chart".to_string();
452 }
453
454 let max_frequency = result
456 .rank_frequency_pairs
457 .iter()
458 .map(|(_, freq)| *freq)
459 .fold(0.0, f64::max);
460
461 if max_frequency == 0.0 {
462 return "All frequencies are zero".to_string();
463 }
464
465 for (rank, frequency) in result.rank_frequency_pairs.iter().take(10) {
467 let normalized_freq = frequency / max_frequency;
468 let bar_length = (normalized_freq * CHART_WIDTH as f64).round() as usize;
469 let bar_length = bar_length.min(CHART_WIDTH);
470
471 let expected_freq = max_frequency / *rank as f64;
473 let expected_normalized = expected_freq / max_frequency;
474 let expected_line_pos = (expected_normalized * CHART_WIDTH as f64).round() as usize;
475 let expected_line_pos = expected_line_pos.min(CHART_WIDTH - 1);
476
477 let mut bar_chars = Vec::new();
479 for pos in 0..CHART_WIDTH {
480 if pos == expected_line_pos {
481 bar_chars.push('┃'); } else if pos < bar_length {
483 bar_chars.push('█'); } else {
485 bar_chars.push('░'); }
487 }
488 let full_bar: String = bar_chars.iter().collect();
489
490 let percentage = (frequency / result.total_observations as f64) * 100.0;
492 let expected_percentage = (expected_freq / result.total_observations as f64) * 100.0;
493
494 output.push_str(&format!(
495 "#{rank:2}: {full_bar} {percentage:>6.2}% (expected: {expected_percentage:.2}%)\n"
496 ));
497 }
498
499 output.push_str(&format!(
501 "\nZipf Exponent: {:.3} (ideal: 1.0), Correlation: {:.3}",
502 result.zipf_exponent, result.correlation_coefficient
503 ));
504
505 output
506}