lawkit_python/subcommands/
zipf.rs

1use crate::colors;
2use crate::common_options::get_optimized_reader;
3use clap::ArgMatches;
4use lawkit_core::{
5    common::{
6        filtering::{apply_number_filter, NumberFilter},
7        input::{parse_input_auto, parse_text_input},
8        memory::{streaming_zipf_analysis, MemoryConfig},
9        risk::RiskLevel,
10        streaming_io::OptimizedFileReader,
11    },
12    error::{BenfError, Result},
13    laws::zipf::{
14        analyze_numeric_zipf, analyze_text_zipf, analyze_text_zipf_from_frequencies, ZipfResult,
15    },
16};
17
18pub fn run(matches: &ArgMatches) -> Result<()> {
19    let is_text_mode = matches.get_flag("text");
20
21    // Determine input source based on arguments
22    if matches.get_flag("verbose") {
23        eprintln!(
24            "Debug: input argument = {:?}",
25            matches.get_one::<String>("input")
26        );
27        eprintln!("Debug: text mode = {is_text_mode}");
28    }
29
30    if let Some(input) = matches.get_one::<String>("input") {
31        // Use auto-detection for file vs string input
32        if is_text_mode {
33            // Text mode: read file or use as text directly
34            let buffer = if input == "-" {
35                match get_optimized_reader(None) {
36                    Ok(data) => data,
37                    Err(e) => {
38                        eprintln!("Error reading input: {e}");
39                        std::process::exit(1);
40                    }
41                }
42            } else {
43                match get_optimized_reader(Some(input)) {
44                    Ok(data) => data,
45                    Err(e) => {
46                        eprintln!("Error reading input: {e}");
47                        std::process::exit(1);
48                    }
49                }
50            };
51
52            match analyze_text_zipf(&buffer, input) {
53                Ok(result) => {
54                    output_results(matches, &result);
55                    std::process::exit(result.risk_level.exit_code());
56                }
57                Err(e) => {
58                    eprintln!("Analysis error: {e}");
59                    std::process::exit(1);
60                }
61            }
62        } else {
63            // Numeric mode
64            match parse_input_auto(input) {
65                Ok(numbers) => {
66                    if numbers.is_empty() {
67                        eprintln!("Error: No valid numbers found in input");
68                        std::process::exit(1);
69                    }
70
71                    let result =
72                        match analyze_numbers_with_options(matches, input.to_string(), &numbers) {
73                            Ok(result) => result,
74                            Err(e) => {
75                                eprintln!("Analysis error: {e}");
76                                std::process::exit(1);
77                            }
78                        };
79
80                    output_results(matches, &result);
81                    std::process::exit(result.risk_level.exit_code());
82                }
83                Err(e) => {
84                    eprintln!("Error processing input '{input}': {e}");
85                    std::process::exit(1);
86                }
87            }
88        }
89    } else {
90        // Read from stdin - use automatic optimization based on data characteristics
91        if matches.get_flag("verbose") {
92            eprintln!("Debug: Reading from stdin, using automatic optimization");
93            eprintln!(
94                "Debug: Using automatic optimization (streaming + incremental + memory efficiency)"
95            );
96        }
97
98        if is_text_mode {
99            // Text mode with streaming
100            let mut reader = OptimizedFileReader::from_stdin();
101            let memory_config = MemoryConfig::default();
102
103            // Process text line by line and extract words
104            let mut words = Vec::new();
105            match reader.read_lines_streaming(|line: String| {
106                // Extract words from line
107                let line_words: Vec<String> =
108                    line.split_whitespace().map(|s| s.to_string()).collect();
109                words.extend(line_words);
110                Ok(None::<()>)
111            }) {
112                Ok(_) => {}
113                Err(e) => {
114                    eprintln!("Error reading stream: {e}");
115                    std::process::exit(1);
116                }
117            }
118
119            if matches.get_flag("verbose") {
120                eprintln!("Debug: Collected {} words from stream", words.len());
121            }
122
123            // Use streaming analysis
124            let chunk_result = match streaming_zipf_analysis(words.into_iter(), &memory_config) {
125                Ok(result) => {
126                    if matches.get_flag("verbose") {
127                        eprintln!(
128                            "Debug: Streaming analysis successful - {} items processed",
129                            result.total_items
130                        );
131                    }
132                    result
133                }
134                Err(e) => {
135                    eprintln!("Streaming analysis error: {e}");
136                    std::process::exit(1);
137                }
138            };
139
140            if matches.get_flag("verbose") {
141                eprintln!(
142                    "Debug: Processed {} items in {} chunks",
143                    chunk_result.total_items, chunk_result.chunks_processed
144                );
145                eprintln!("Debug: Memory used: {:.2} MB", chunk_result.memory_used_mb);
146                eprintln!(
147                    "Debug: Processing time: {} ms",
148                    chunk_result.processing_time_ms
149                );
150            }
151
152            // Convert IncrementalZipf to ZipfResult
153            let frequencies = chunk_result.result.get_sorted_frequencies();
154            let result = match analyze_text_zipf_from_frequencies(&frequencies, "stdin") {
155                Ok(result) => result,
156                Err(e) => {
157                    eprintln!("Analysis error: {e}");
158                    std::process::exit(1);
159                }
160            };
161
162            output_results(matches, &result);
163            std::process::exit(result.risk_level.exit_code());
164        } else {
165            // Numeric mode
166            let buffer = match get_optimized_reader(None) {
167                Ok(data) => data,
168                Err(e) => {
169                    eprintln!("Error reading input: {e}");
170                    std::process::exit(1);
171                }
172            };
173            let numbers = match parse_text_input(&buffer) {
174                Ok(numbers) => {
175                    if matches.get_flag("verbose") {
176                        eprintln!("Debug: Collected {} numbers from input", numbers.len());
177                    }
178                    numbers
179                }
180                Err(e) => {
181                    eprintln!("Analysis error: {e}");
182                    std::process::exit(1);
183                }
184            };
185
186            if numbers.is_empty() {
187                eprintln!("Error: No valid numbers found in input");
188                std::process::exit(1);
189            }
190
191            let result = match analyze_numbers_with_options(matches, "stdin".to_string(), &numbers)
192            {
193                Ok(result) => result,
194                Err(e) => {
195                    eprintln!("Analysis error: {e}");
196                    std::process::exit(1);
197                }
198            };
199
200            output_results(matches, &result);
201            std::process::exit(result.risk_level.exit_code());
202        }
203    }
204}
205
206fn output_results(matches: &clap::ArgMatches, result: &ZipfResult) {
207    let format = matches.get_one::<String>("format").unwrap();
208    let quiet = matches.get_flag("quiet");
209    let verbose = matches.get_flag("verbose");
210
211    match format.as_str() {
212        "text" => print_text_output(result, quiet, verbose),
213        "json" => print_json_output(result),
214        "csv" => print_csv_output(result),
215        "yaml" => print_yaml_output(result),
216        "toml" => print_toml_output(result),
217        "xml" => print_xml_output(result),
218        _ => {
219            eprintln!("Error: Unsupported output format: {format}");
220            std::process::exit(2);
221        }
222    }
223}
224
225fn print_text_output(result: &ZipfResult, quiet: bool, verbose: bool) {
226    if quiet {
227        println!("zipf_exponent: {:.3}", result.zipf_exponent);
228        println!("correlation: {:.3}", result.correlation_coefficient);
229        println!("distribution_quality: {:.3}", result.distribution_quality);
230        return;
231    }
232
233    println!("Zipf Law Analysis Results");
234    println!();
235    println!("Dataset: {}", result.dataset_name);
236    println!("Numbers analyzed: {}", result.numbers_analyzed);
237    match result.risk_level {
238        RiskLevel::Critical => println!("{}", colors::level_critical("Dataset analysis")),
239        RiskLevel::High => println!("{}", colors::level_high("Dataset analysis")),
240        RiskLevel::Medium => println!("{}", colors::level_medium("Dataset analysis")),
241        RiskLevel::Low => println!("{}", colors::level_low("Dataset analysis")),
242    }
243
244    println!();
245    println!("Rank-Frequency Distribution:");
246    println!("{}", format_rank_frequency_chart(result));
247
248    if verbose {
249        println!();
250        println!("Zipf Metrics:");
251        println!("  Zipf exponent: {:.3}", result.zipf_exponent);
252        println!(
253            "  Correlation coefficient: {:.3}",
254            result.correlation_coefficient
255        );
256        println!("  Distribution quality: {:.3}", result.distribution_quality);
257        println!("  Power law fit: {:.3}", result.power_law_fit);
258
259        println!();
260        println!("Distribution Statistics:");
261        println!("  Total observations: {}", result.total_observations);
262        println!("  Unique items: {}", result.unique_items);
263        println!("  Top item frequency: {:.1}%", result.top_item_frequency);
264        println!("  Concentration index: {:.3}", result.concentration_index);
265        println!("  Diversity index (Shannon): {:.3}", result.diversity_index);
266
267        println!();
268        println!("Interpretation:");
269        print_zipf_interpretation(result);
270    }
271}
272
273fn print_zipf_interpretation(result: &ZipfResult) {
274    use lawkit_core::common::risk::RiskLevel;
275
276    match result.risk_level {
277        RiskLevel::Low => {
278            println!("[PASS] Ideal Zipf distribution - follows Zipf's law");
279            println!("   Distribution follows the expected 1/rank pattern");
280        }
281        RiskLevel::Medium => {
282            println!("[WARN] Slight deviation from Zipf's law");
283            println!("   Monitoring recommended for distribution pattern");
284        }
285        RiskLevel::High => {
286            println!("[FAIL] Significant deviation from Zipf's law");
287            println!("   Consider rebalancing distribution");
288        }
289        RiskLevel::Critical => {
290            println!("[CRITICAL] Critical deviation from Zipf's law");
291            println!("   Distribution strategy review needed");
292        }
293    }
294
295    // Zipf指数に基づく解釈
296    if result.zipf_exponent > 1.5 {
297        println!("   INFO: High concentration - extreme dominance pattern");
298    } else if result.zipf_exponent < 0.5 {
299        println!("   INFO: Low concentration - more uniform distribution");
300    }
301
302    // 相関係数に基づく解釈
303    if result.correlation_coefficient < 0.5 {
304        println!("   ALERT: Poor fit to Zipf's law - irregular distribution");
305    } else if result.correlation_coefficient > 0.8 {
306        println!("   INFO: Excellent fit to Zipf's law");
307    }
308}
309
310fn print_json_output(result: &ZipfResult) {
311    use serde_json::json;
312
313    let output = json!({
314        "dataset": result.dataset_name,
315        "numbers_analyzed": result.numbers_analyzed,
316        "risk_level": format!("{:?}", result.risk_level),
317        "zipf_exponent": result.zipf_exponent,
318        "correlation_coefficient": result.correlation_coefficient,
319        "distribution_quality": result.distribution_quality,
320        "total_observations": result.total_observations,
321        "unique_items": result.unique_items,
322        "top_item_frequency": result.top_item_frequency,
323        "concentration_index": result.concentration_index,
324        "diversity_index": result.diversity_index,
325        "power_law_fit": result.power_law_fit,
326        "rank_frequency_pairs": result.rank_frequency_pairs
327    });
328
329    println!("{}", serde_json::to_string_pretty(&output).unwrap());
330}
331
332fn print_csv_output(result: &ZipfResult) {
333    println!("dataset,numbers_analyzed,risk_level,zipf_exponent,correlation_coefficient,distribution_quality,power_law_fit");
334    println!(
335        "{},{},{:?},{:.3},{:.3},{:.3},{:.3}",
336        result.dataset_name,
337        result.numbers_analyzed,
338        result.risk_level,
339        result.zipf_exponent,
340        result.correlation_coefficient,
341        result.distribution_quality,
342        result.power_law_fit
343    );
344}
345
346fn print_yaml_output(result: &ZipfResult) {
347    println!("dataset: \"{}\"", result.dataset_name);
348    println!("numbers_analyzed: {}", result.numbers_analyzed);
349    println!("risk_level: \"{:?}\"", result.risk_level);
350    println!("zipf_exponent: {:.3}", result.zipf_exponent);
351    println!(
352        "correlation_coefficient: {:.3}",
353        result.correlation_coefficient
354    );
355    println!("distribution_quality: {:.3}", result.distribution_quality);
356    println!("power_law_fit: {:.3}", result.power_law_fit);
357}
358
359fn print_toml_output(result: &ZipfResult) {
360    println!("dataset = \"{}\"", result.dataset_name);
361    println!("numbers_analyzed = {}", result.numbers_analyzed);
362    println!("risk_level = \"{:?}\"", result.risk_level);
363    println!("zipf_exponent = {:.3}", result.zipf_exponent);
364    println!(
365        "correlation_coefficient = {:.3}",
366        result.correlation_coefficient
367    );
368    println!("distribution_quality = {:.3}", result.distribution_quality);
369    println!("power_law_fit = {:.3}", result.power_law_fit);
370}
371
372fn print_xml_output(result: &ZipfResult) {
373    println!("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
374    println!("<zipf_analysis>");
375    println!("  <dataset>{}</dataset>", result.dataset_name);
376    println!(
377        "  <numbers_analyzed>{}</numbers_analyzed>",
378        result.numbers_analyzed
379    );
380    println!("  <risk_level>{:?}</risk_level>", result.risk_level);
381    println!(
382        "  <zipf_exponent>{:.3}</zipf_exponent>",
383        result.zipf_exponent
384    );
385    println!(
386        "  <correlation_coefficient>{:.3}</correlation_coefficient>",
387        result.correlation_coefficient
388    );
389    println!(
390        "  <distribution_quality>{:.3}</distribution_quality>",
391        result.distribution_quality
392    );
393    println!(
394        "  <power_law_fit>{:.3}</power_law_fit>",
395        result.power_law_fit
396    );
397    println!("</zipf_analysis>");
398}
399
400/// Analyze numbers with filtering and custom options
401fn analyze_numbers_with_options(
402    matches: &clap::ArgMatches,
403    dataset_name: String,
404    numbers: &[f64],
405) -> Result<ZipfResult> {
406    // Apply number filtering if specified
407    let filtered_numbers = if let Some(filter_str) = matches.get_one::<String>("filter") {
408        let filter = NumberFilter::parse(filter_str)
409            .map_err(|e| BenfError::ParseError(format!("無効なフィルタ: {e}")))?;
410
411        let filtered = apply_number_filter(numbers, &filter);
412
413        // Inform user about filtering results
414        if filtered.len() != numbers.len() {
415            eprintln!(
416                "フィルタリング結果: {} 個の数値が {} 個に絞り込まれました ({})",
417                numbers.len(),
418                filtered.len(),
419                filter.description()
420            );
421        }
422
423        filtered
424    } else {
425        numbers.to_vec()
426    };
427
428    // Parse minimum count requirement
429    let min_count = if let Some(min_count_str) = matches.get_one::<String>("min-count") {
430        min_count_str
431            .parse::<usize>()
432            .map_err(|_| BenfError::ParseError("無効な最小数値数".to_string()))?
433    } else {
434        5
435    };
436
437    // Check minimum count requirement
438    if filtered_numbers.len() < min_count {
439        return Err(BenfError::InsufficientData(filtered_numbers.len()));
440    }
441
442    // Perform Zipf analysis
443    analyze_numeric_zipf(&filtered_numbers, &dataset_name)
444}
445
446fn format_rank_frequency_chart(result: &ZipfResult) -> String {
447    let mut output = String::new();
448    const CHART_WIDTH: usize = 50;
449
450    if result.rank_frequency_pairs.is_empty() {
451        return "No data available for chart".to_string();
452    }
453
454    // 最大頻度を取得(正規化用)
455    let max_frequency = result
456        .rank_frequency_pairs
457        .iter()
458        .map(|(_, freq)| *freq)
459        .fold(0.0, f64::max);
460
461    if max_frequency == 0.0 {
462        return "All frequencies are zero".to_string();
463    }
464
465    // ランク-頻度ペアを表示(上位10項目)
466    for (rank, frequency) in result.rank_frequency_pairs.iter().take(10) {
467        let normalized_freq = frequency / max_frequency;
468        let bar_length = (normalized_freq * CHART_WIDTH as f64).round() as usize;
469        let bar_length = bar_length.min(CHART_WIDTH);
470
471        // Calculate expected value based on ideal Zipf law (1/rank)
472        let expected_freq = max_frequency / *rank as f64;
473        let expected_normalized = expected_freq / max_frequency;
474        let expected_line_pos = (expected_normalized * CHART_WIDTH as f64).round() as usize;
475        let expected_line_pos = expected_line_pos.min(CHART_WIDTH - 1);
476
477        // Create bar with filled portion, expected value line, and background
478        let mut bar_chars = Vec::new();
479        for pos in 0..CHART_WIDTH {
480            if pos == expected_line_pos {
481                bar_chars.push('┃'); // Expected value line (ideal Zipf)
482            } else if pos < bar_length {
483                bar_chars.push('█'); // Filled portion
484            } else {
485                bar_chars.push('░'); // Background portion
486            }
487        }
488        let full_bar: String = bar_chars.iter().collect();
489
490        // パーセンテージ計算
491        let percentage = (frequency / result.total_observations as f64) * 100.0;
492        let expected_percentage = (expected_freq / result.total_observations as f64) * 100.0;
493
494        output.push_str(&format!(
495            "#{rank:2}: {full_bar} {percentage:>6.2}% (expected: {expected_percentage:.2}%)\n"
496        ));
497    }
498
499    // Zipf法則の適合度情報
500    output.push_str(&format!(
501        "\nZipf Exponent: {:.3} (ideal: 1.0), Correlation: {:.3}",
502        result.zipf_exponent, result.correlation_coefficient
503    ));
504
505    output
506}