lawkit_python/subcommands/
zipf.rs1use crate::colors;
2use crate::common_options::get_optimized_reader;
3use clap::ArgMatches;
4use lawkit_core::{
5 common::{
6 filtering::{apply_number_filter, NumberFilter},
7 input::{parse_input_auto, parse_text_input},
8 memory::{streaming_zipf_analysis, MemoryConfig},
9 risk::RiskLevel,
10 streaming_io::OptimizedFileReader,
11 },
12 error::{BenfError, Result},
13 laws::zipf::{
14 analyze_numeric_zipf, analyze_text_zipf, analyze_text_zipf_from_frequencies, ZipfResult,
15 },
16};
17
18pub fn run(matches: &ArgMatches) -> Result<()> {
19 let is_text_mode = matches.get_flag("text");
20
21 if matches.get_flag("verbose") {
23 eprintln!(
24 "Debug: input argument = {:?}",
25 matches.get_one::<String>("input")
26 );
27 eprintln!("Debug: text mode = {is_text_mode}");
28 }
29
30 if let Some(input) = matches.get_one::<String>("input") {
31 if is_text_mode {
33 let buffer = if input == "-" {
35 match get_optimized_reader(None) {
36 Ok(data) => data,
37 Err(e) => {
38 eprintln!("Error reading input: {e}");
39 std::process::exit(1);
40 }
41 }
42 } else {
43 match get_optimized_reader(Some(input)) {
44 Ok(data) => data,
45 Err(e) => {
46 eprintln!("Error reading input: {e}");
47 std::process::exit(1);
48 }
49 }
50 };
51
52 match analyze_text_zipf(&buffer, input) {
53 Ok(result) => {
54 output_results(matches, &result);
55 std::process::exit(result.risk_level.exit_code());
56 }
57 Err(e) => {
58 eprintln!("Analysis error: {e}");
59 std::process::exit(1);
60 }
61 }
62 } else {
63 match parse_input_auto(input) {
65 Ok(numbers) => {
66 if numbers.is_empty() {
67 eprintln!("Error: No valid numbers found in input");
68 std::process::exit(1);
69 }
70
71 let result =
72 match analyze_numbers_with_options(matches, input.to_string(), &numbers) {
73 Ok(result) => result,
74 Err(e) => {
75 eprintln!("Analysis error: {e}");
76 std::process::exit(1);
77 }
78 };
79
80 output_results(matches, &result);
81 std::process::exit(result.risk_level.exit_code());
82 }
83 Err(e) => {
84 eprintln!("Error processing input '{input}': {e}");
85 std::process::exit(1);
86 }
87 }
88 }
89 } else {
90 if matches.get_flag("verbose") {
92 eprintln!("Debug: Reading from stdin, using automatic optimization");
93 eprintln!(
94 "Debug: Using automatic optimization (streaming + incremental + memory efficiency)"
95 );
96 }
97
98 if is_text_mode {
99 let mut reader = OptimizedFileReader::from_stdin();
101 let memory_config = MemoryConfig::default();
102
103 let mut words = Vec::new();
105 match reader.read_lines_streaming(|line: String| {
106 let line_words: Vec<String> =
108 line.split_whitespace().map(|s| s.to_string()).collect();
109 words.extend(line_words);
110 Ok(None::<()>)
111 }) {
112 Ok(_) => {}
113 Err(e) => {
114 eprintln!("Error reading stream: {e}");
115 std::process::exit(1);
116 }
117 }
118
119 if matches.get_flag("verbose") {
120 eprintln!("Debug: Collected {} words from stream", words.len());
121 }
122
123 let chunk_result = match streaming_zipf_analysis(words.into_iter(), &memory_config) {
125 Ok(result) => {
126 if matches.get_flag("verbose") {
127 eprintln!(
128 "Debug: Streaming analysis successful - {} items processed",
129 result.total_items
130 );
131 }
132 result
133 }
134 Err(e) => {
135 eprintln!("Streaming analysis error: {e}");
136 std::process::exit(1);
137 }
138 };
139
140 if matches.get_flag("verbose") {
141 eprintln!(
142 "Debug: Processed {} items in {} chunks",
143 chunk_result.total_items, chunk_result.chunks_processed
144 );
145 eprintln!("Debug: Memory used: {:.2} MB", chunk_result.memory_used_mb);
146 eprintln!(
147 "Debug: Processing time: {} ms",
148 chunk_result.processing_time_ms
149 );
150 }
151
152 let frequencies = chunk_result.result.get_sorted_frequencies();
154 let result = match analyze_text_zipf_from_frequencies(&frequencies, "stdin") {
155 Ok(result) => result,
156 Err(e) => {
157 eprintln!("Analysis error: {e}");
158 std::process::exit(1);
159 }
160 };
161
162 output_results(matches, &result);
163 std::process::exit(result.risk_level.exit_code());
164 } else {
165 let buffer = match get_optimized_reader(None) {
167 Ok(data) => data,
168 Err(e) => {
169 eprintln!("Error reading input: {e}");
170 std::process::exit(1);
171 }
172 };
173 let numbers = match parse_text_input(&buffer) {
174 Ok(numbers) => {
175 if matches.get_flag("verbose") {
176 eprintln!("Debug: Collected {} numbers from input", numbers.len());
177 }
178 numbers
179 }
180 Err(e) => {
181 eprintln!("Analysis error: {e}");
182 std::process::exit(1);
183 }
184 };
185
186 if numbers.is_empty() {
187 eprintln!("Error: No valid numbers found in input");
188 std::process::exit(1);
189 }
190
191 let result = match analyze_numbers_with_options(matches, "stdin".to_string(), &numbers)
192 {
193 Ok(result) => result,
194 Err(e) => {
195 eprintln!("Analysis error: {e}");
196 std::process::exit(1);
197 }
198 };
199
200 output_results(matches, &result);
201 std::process::exit(result.risk_level.exit_code());
202 }
203 }
204}
205
206fn output_results(matches: &clap::ArgMatches, result: &ZipfResult) {
207 let format = matches.get_one::<String>("format").unwrap();
208 let quiet = matches.get_flag("quiet");
209 let verbose = matches.get_flag("verbose");
210 let no_color = matches.get_flag("no-color");
211
212 match format.as_str() {
213 "text" => print_text_output(result, quiet, verbose, no_color),
214 "json" => print_json_output(result),
215 "csv" => print_csv_output(result),
216 "yaml" => print_yaml_output(result),
217 "toml" => print_toml_output(result),
218 "xml" => print_xml_output(result),
219 _ => {
220 eprintln!("Error: Unsupported output format: {format}");
221 std::process::exit(2);
222 }
223 }
224}
225
226fn print_text_output(result: &ZipfResult, quiet: bool, verbose: bool, no_color: bool) {
227 if quiet {
228 println!("zipf_exponent: {:.3}", result.zipf_exponent);
229 println!("correlation: {:.3}", result.correlation_coefficient);
230 println!("distribution_quality: {:.3}", result.distribution_quality);
231 return;
232 }
233
234 println!("Zipf Law Analysis Results");
235 println!();
236 println!("Dataset: {}", result.dataset_name);
237 println!("Numbers analyzed: {}", result.numbers_analyzed);
238 match result.risk_level {
239 RiskLevel::Critical => println!("{}", colors::level_critical("Dataset analysis", no_color)),
240 RiskLevel::High => println!("{}", colors::level_high("Dataset analysis", no_color)),
241 RiskLevel::Medium => println!("{}", colors::level_medium("Dataset analysis", no_color)),
242 RiskLevel::Low => println!("{}", colors::level_low("Dataset analysis", no_color)),
243 }
244
245 println!();
246 println!("Rank-Frequency Distribution:");
247 println!("{}", format_rank_frequency_chart(result));
248
249 if verbose {
250 println!();
251 println!("Zipf Metrics:");
252 println!(" Zipf exponent: {:.3}", result.zipf_exponent);
253 println!(
254 " Correlation coefficient: {:.3}",
255 result.correlation_coefficient
256 );
257 println!(" Distribution quality: {:.3}", result.distribution_quality);
258 println!(" Power law fit: {:.3}", result.power_law_fit);
259
260 println!();
261 println!("Distribution Statistics:");
262 println!(" Total observations: {}", result.total_observations);
263 println!(" Unique items: {}", result.unique_items);
264 println!(" Top item frequency: {:.1}%", result.top_item_frequency);
265 println!(" Concentration index: {:.3}", result.concentration_index);
266 println!(" Diversity index (Shannon): {:.3}", result.diversity_index);
267
268 println!();
269 println!("Interpretation:");
270 print_zipf_interpretation(result);
271 }
272}
273
274fn print_zipf_interpretation(result: &ZipfResult) {
275 use lawkit_core::common::risk::RiskLevel;
276
277 match result.risk_level {
278 RiskLevel::Low => {
279 println!("[PASS] Ideal Zipf distribution - follows Zipf's law");
280 println!(" Distribution follows the expected 1/rank pattern");
281 }
282 RiskLevel::Medium => {
283 println!("[WARN] Slight deviation from Zipf's law");
284 println!(" Monitoring recommended for distribution pattern");
285 }
286 RiskLevel::High => {
287 println!("[FAIL] Significant deviation from Zipf's law");
288 println!(" Consider rebalancing distribution");
289 }
290 RiskLevel::Critical => {
291 println!("[CRITICAL] Critical deviation from Zipf's law");
292 println!(" Distribution strategy review needed");
293 }
294 }
295
296 if result.zipf_exponent > 1.5 {
298 println!(" INFO: High concentration - extreme dominance pattern");
299 } else if result.zipf_exponent < 0.5 {
300 println!(" INFO: Low concentration - more uniform distribution");
301 }
302
303 if result.correlation_coefficient < 0.5 {
305 println!(" ALERT: Poor fit to Zipf's law - irregular distribution");
306 } else if result.correlation_coefficient > 0.8 {
307 println!(" INFO: Excellent fit to Zipf's law");
308 }
309}
310
311fn print_json_output(result: &ZipfResult) {
312 use serde_json::json;
313
314 let output = json!({
315 "dataset": result.dataset_name,
316 "numbers_analyzed": result.numbers_analyzed,
317 "risk_level": format!("{:?}", result.risk_level),
318 "zipf_exponent": result.zipf_exponent,
319 "correlation_coefficient": result.correlation_coefficient,
320 "distribution_quality": result.distribution_quality,
321 "total_observations": result.total_observations,
322 "unique_items": result.unique_items,
323 "top_item_frequency": result.top_item_frequency,
324 "concentration_index": result.concentration_index,
325 "diversity_index": result.diversity_index,
326 "power_law_fit": result.power_law_fit,
327 "rank_frequency_pairs": result.rank_frequency_pairs
328 });
329
330 println!("{}", serde_json::to_string_pretty(&output).unwrap());
331}
332
333fn print_csv_output(result: &ZipfResult) {
334 println!("dataset,numbers_analyzed,risk_level,zipf_exponent,correlation_coefficient,distribution_quality,power_law_fit");
335 println!(
336 "{},{},{:?},{:.3},{:.3},{:.3},{:.3}",
337 result.dataset_name,
338 result.numbers_analyzed,
339 result.risk_level,
340 result.zipf_exponent,
341 result.correlation_coefficient,
342 result.distribution_quality,
343 result.power_law_fit
344 );
345}
346
347fn print_yaml_output(result: &ZipfResult) {
348 println!("dataset: \"{}\"", result.dataset_name);
349 println!("numbers_analyzed: {}", result.numbers_analyzed);
350 println!("risk_level: \"{:?}\"", result.risk_level);
351 println!("zipf_exponent: {:.3}", result.zipf_exponent);
352 println!(
353 "correlation_coefficient: {:.3}",
354 result.correlation_coefficient
355 );
356 println!("distribution_quality: {:.3}", result.distribution_quality);
357 println!("power_law_fit: {:.3}", result.power_law_fit);
358}
359
360fn print_toml_output(result: &ZipfResult) {
361 println!("dataset = \"{}\"", result.dataset_name);
362 println!("numbers_analyzed = {}", result.numbers_analyzed);
363 println!("risk_level = \"{:?}\"", result.risk_level);
364 println!("zipf_exponent = {:.3}", result.zipf_exponent);
365 println!(
366 "correlation_coefficient = {:.3}",
367 result.correlation_coefficient
368 );
369 println!("distribution_quality = {:.3}", result.distribution_quality);
370 println!("power_law_fit = {:.3}", result.power_law_fit);
371}
372
373fn print_xml_output(result: &ZipfResult) {
374 println!("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
375 println!("<zipf_analysis>");
376 println!(" <dataset>{}</dataset>", result.dataset_name);
377 println!(
378 " <numbers_analyzed>{}</numbers_analyzed>",
379 result.numbers_analyzed
380 );
381 println!(" <risk_level>{:?}</risk_level>", result.risk_level);
382 println!(
383 " <zipf_exponent>{:.3}</zipf_exponent>",
384 result.zipf_exponent
385 );
386 println!(
387 " <correlation_coefficient>{:.3}</correlation_coefficient>",
388 result.correlation_coefficient
389 );
390 println!(
391 " <distribution_quality>{:.3}</distribution_quality>",
392 result.distribution_quality
393 );
394 println!(
395 " <power_law_fit>{:.3}</power_law_fit>",
396 result.power_law_fit
397 );
398 println!("</zipf_analysis>");
399}
400
401fn analyze_numbers_with_options(
403 matches: &clap::ArgMatches,
404 dataset_name: String,
405 numbers: &[f64],
406) -> Result<ZipfResult> {
407 let filtered_numbers = if let Some(filter_str) = matches.get_one::<String>("filter") {
409 let filter = NumberFilter::parse(filter_str)
410 .map_err(|e| BenfError::ParseError(format!("無効なフィルタ: {e}")))?;
411
412 let filtered = apply_number_filter(numbers, &filter);
413
414 if filtered.len() != numbers.len() {
416 eprintln!(
417 "フィルタリング結果: {} 個の数値が {} 個に絞り込まれました ({})",
418 numbers.len(),
419 filtered.len(),
420 filter.description()
421 );
422 }
423
424 filtered
425 } else {
426 numbers.to_vec()
427 };
428
429 let min_count = if let Some(min_count_str) = matches.get_one::<String>("min-count") {
431 min_count_str
432 .parse::<usize>()
433 .map_err(|_| BenfError::ParseError("無効な最小数値数".to_string()))?
434 } else {
435 5
436 };
437
438 if filtered_numbers.len() < min_count {
440 return Err(BenfError::InsufficientData(filtered_numbers.len()));
441 }
442
443 analyze_numeric_zipf(&filtered_numbers, &dataset_name)
445}
446
447fn format_rank_frequency_chart(result: &ZipfResult) -> String {
448 let mut output = String::new();
449 const CHART_WIDTH: usize = 50;
450
451 if result.rank_frequency_pairs.is_empty() {
452 return "No data available for chart".to_string();
453 }
454
455 let max_frequency = result
457 .rank_frequency_pairs
458 .iter()
459 .map(|(_, freq)| *freq)
460 .fold(0.0, f64::max);
461
462 if max_frequency == 0.0 {
463 return "All frequencies are zero".to_string();
464 }
465
466 for (rank, frequency) in result.rank_frequency_pairs.iter().take(10) {
468 let normalized_freq = frequency / max_frequency;
469 let bar_length = (normalized_freq * CHART_WIDTH as f64).round() as usize;
470 let bar_length = bar_length.min(CHART_WIDTH);
471
472 let expected_freq = max_frequency / *rank as f64;
474 let expected_normalized = expected_freq / max_frequency;
475 let expected_line_pos = (expected_normalized * CHART_WIDTH as f64).round() as usize;
476 let expected_line_pos = expected_line_pos.min(CHART_WIDTH - 1);
477
478 let mut bar_chars = Vec::new();
480 for pos in 0..CHART_WIDTH {
481 if pos == expected_line_pos {
482 bar_chars.push('┃'); } else if pos < bar_length {
484 bar_chars.push('█'); } else {
486 bar_chars.push('░'); }
488 }
489 let full_bar: String = bar_chars.iter().collect();
490
491 let percentage = (frequency / result.total_observations as f64) * 100.0;
493 let expected_percentage = (expected_freq / result.total_observations as f64) * 100.0;
494
495 output.push_str(&format!(
496 "#{rank:2}: {full_bar} {percentage:>6.2}% (expected: {expected_percentage:.2}%)\n"
497 ));
498 }
499
500 output.push_str(&format!(
502 "\nZipf Exponent: {:.3} (ideal: 1.0), Correlation: {:.3}",
503 result.zipf_exponent, result.correlation_coefficient
504 ));
505
506 output
507}