Skip to main content

lawkit_python/subcommands/
zipf.rs

1use crate::colors;
2use crate::common_options::get_optimized_reader;
3use clap::ArgMatches;
4use lawkit_core::{
5    common::{
6        filtering::{apply_number_filter, NumberFilter},
7        input::{parse_input_auto, parse_text_input},
8        memory::{streaming_zipf_analysis, MemoryConfig},
9        risk::RiskLevel,
10        streaming_io::OptimizedFileReader,
11    },
12    error::{BenfError, Result},
13    laws::zipf::{
14        analyze_numeric_zipf, analyze_text_zipf, analyze_text_zipf_from_frequencies, ZipfResult,
15    },
16};
17
18pub fn run(matches: &ArgMatches) -> Result<()> {
19    let is_text_mode = matches.get_flag("text");
20
21    // Determine input source based on arguments
22    if matches.get_flag("verbose") {
23        eprintln!(
24            "Debug: input argument = {:?}",
25            matches.get_one::<String>("input")
26        );
27        eprintln!("Debug: text mode = {is_text_mode}");
28    }
29
30    if let Some(input) = matches.get_one::<String>("input") {
31        // Use auto-detection for file vs string input
32        if is_text_mode {
33            // Text mode: read file or use as text directly
34            let buffer = if input == "-" {
35                match get_optimized_reader(None) {
36                    Ok(data) => data,
37                    Err(e) => {
38                        eprintln!("Error reading input: {e}");
39                        std::process::exit(1);
40                    }
41                }
42            } else {
43                match get_optimized_reader(Some(input)) {
44                    Ok(data) => data,
45                    Err(e) => {
46                        eprintln!("Error reading input: {e}");
47                        std::process::exit(1);
48                    }
49                }
50            };
51
52            match analyze_text_zipf(&buffer, input) {
53                Ok(result) => {
54                    output_results(matches, &result);
55                    std::process::exit(result.risk_level.exit_code());
56                }
57                Err(e) => {
58                    eprintln!("Analysis error: {e}");
59                    std::process::exit(1);
60                }
61            }
62        } else {
63            // Numeric mode
64            match parse_input_auto(input) {
65                Ok(numbers) => {
66                    if numbers.is_empty() {
67                        eprintln!("Error: No valid numbers found in input");
68                        std::process::exit(1);
69                    }
70
71                    let result =
72                        match analyze_numbers_with_options(matches, input.to_string(), &numbers) {
73                            Ok(result) => result,
74                            Err(e) => {
75                                eprintln!("Analysis error: {e}");
76                                std::process::exit(1);
77                            }
78                        };
79
80                    output_results(matches, &result);
81                    std::process::exit(result.risk_level.exit_code());
82                }
83                Err(e) => {
84                    eprintln!("Error processing input '{input}': {e}");
85                    std::process::exit(1);
86                }
87            }
88        }
89    } else {
90        // Read from stdin - use automatic optimization based on data characteristics
91        if matches.get_flag("verbose") {
92            eprintln!("Debug: Reading from stdin, using automatic optimization");
93            eprintln!(
94                "Debug: Using automatic optimization (streaming + incremental + memory efficiency)"
95            );
96        }
97
98        if is_text_mode {
99            // Text mode with streaming
100            let mut reader = OptimizedFileReader::from_stdin();
101            let memory_config = MemoryConfig::default();
102
103            // Process text line by line and extract words
104            let mut words = Vec::new();
105            match reader.read_lines_streaming(|line: String| {
106                // Extract words from line
107                let line_words: Vec<String> =
108                    line.split_whitespace().map(|s| s.to_string()).collect();
109                words.extend(line_words);
110                Ok(None::<()>)
111            }) {
112                Ok(_) => {}
113                Err(e) => {
114                    eprintln!("Error reading stream: {e}");
115                    std::process::exit(1);
116                }
117            }
118
119            if matches.get_flag("verbose") {
120                eprintln!("Debug: Collected {} words from stream", words.len());
121            }
122
123            // Use streaming analysis
124            let chunk_result = match streaming_zipf_analysis(words.into_iter(), &memory_config) {
125                Ok(result) => {
126                    if matches.get_flag("verbose") {
127                        eprintln!(
128                            "Debug: Streaming analysis successful - {} items processed",
129                            result.total_items
130                        );
131                    }
132                    result
133                }
134                Err(e) => {
135                    eprintln!("Streaming analysis error: {e}");
136                    std::process::exit(1);
137                }
138            };
139
140            if matches.get_flag("verbose") {
141                eprintln!(
142                    "Debug: Processed {} items in {} chunks",
143                    chunk_result.total_items, chunk_result.chunks_processed
144                );
145                eprintln!("Debug: Memory used: {:.2} MB", chunk_result.memory_used_mb);
146                eprintln!(
147                    "Debug: Processing time: {} ms",
148                    chunk_result.processing_time_ms
149                );
150            }
151
152            // Convert IncrementalZipf to ZipfResult
153            let frequencies = chunk_result.result.get_sorted_frequencies();
154            let result = match analyze_text_zipf_from_frequencies(&frequencies, "stdin") {
155                Ok(result) => result,
156                Err(e) => {
157                    eprintln!("Analysis error: {e}");
158                    std::process::exit(1);
159                }
160            };
161
162            output_results(matches, &result);
163            std::process::exit(result.risk_level.exit_code());
164        } else {
165            // Numeric mode
166            let buffer = match get_optimized_reader(None) {
167                Ok(data) => data,
168                Err(e) => {
169                    eprintln!("Error reading input: {e}");
170                    std::process::exit(1);
171                }
172            };
173            let numbers = match parse_text_input(&buffer) {
174                Ok(numbers) => {
175                    if matches.get_flag("verbose") {
176                        eprintln!("Debug: Collected {} numbers from input", numbers.len());
177                    }
178                    numbers
179                }
180                Err(e) => {
181                    eprintln!("Analysis error: {e}");
182                    std::process::exit(1);
183                }
184            };
185
186            if numbers.is_empty() {
187                eprintln!("Error: No valid numbers found in input");
188                std::process::exit(1);
189            }
190
191            let result = match analyze_numbers_with_options(matches, "stdin".to_string(), &numbers)
192            {
193                Ok(result) => result,
194                Err(e) => {
195                    eprintln!("Analysis error: {e}");
196                    std::process::exit(1);
197                }
198            };
199
200            output_results(matches, &result);
201            std::process::exit(result.risk_level.exit_code());
202        }
203    }
204}
205
206fn output_results(matches: &clap::ArgMatches, result: &ZipfResult) {
207    let format = matches.get_one::<String>("format").unwrap();
208    let quiet = matches.get_flag("quiet");
209    let verbose = matches.get_flag("verbose");
210    let no_color = matches.get_flag("no-color");
211
212    match format.as_str() {
213        "text" => print_text_output(result, quiet, verbose, no_color),
214        "json" => print_json_output(result),
215        "csv" => print_csv_output(result),
216        "yaml" => print_yaml_output(result),
217        "toml" => print_toml_output(result),
218        "xml" => print_xml_output(result),
219        _ => {
220            eprintln!("Error: Unsupported output format: {format}");
221            std::process::exit(2);
222        }
223    }
224}
225
226fn print_text_output(result: &ZipfResult, quiet: bool, verbose: bool, no_color: bool) {
227    if quiet {
228        println!("zipf_exponent: {:.3}", result.zipf_exponent);
229        println!("correlation: {:.3}", result.correlation_coefficient);
230        println!("distribution_quality: {:.3}", result.distribution_quality);
231        return;
232    }
233
234    println!("Zipf Law Analysis Results");
235    println!();
236    println!("Dataset: {}", result.dataset_name);
237    println!("Numbers analyzed: {}", result.numbers_analyzed);
238    match result.risk_level {
239        RiskLevel::Critical => println!("{}", colors::level_critical("Dataset analysis", no_color)),
240        RiskLevel::High => println!("{}", colors::level_high("Dataset analysis", no_color)),
241        RiskLevel::Medium => println!("{}", colors::level_medium("Dataset analysis", no_color)),
242        RiskLevel::Low => println!("{}", colors::level_low("Dataset analysis", no_color)),
243    }
244
245    println!();
246    println!("Rank-Frequency Distribution:");
247    println!("{}", format_rank_frequency_chart(result));
248
249    if verbose {
250        println!();
251        println!("Zipf Metrics:");
252        println!("  Zipf exponent: {:.3}", result.zipf_exponent);
253        println!(
254            "  Correlation coefficient: {:.3}",
255            result.correlation_coefficient
256        );
257        println!("  Distribution quality: {:.3}", result.distribution_quality);
258        println!("  Power law fit: {:.3}", result.power_law_fit);
259
260        println!();
261        println!("Distribution Statistics:");
262        println!("  Total observations: {}", result.total_observations);
263        println!("  Unique items: {}", result.unique_items);
264        println!("  Top item frequency: {:.1}%", result.top_item_frequency);
265        println!("  Concentration index: {:.3}", result.concentration_index);
266        println!("  Diversity index (Shannon): {:.3}", result.diversity_index);
267
268        println!();
269        println!("Interpretation:");
270        print_zipf_interpretation(result);
271    }
272}
273
274fn print_zipf_interpretation(result: &ZipfResult) {
275    use lawkit_core::common::risk::RiskLevel;
276
277    match result.risk_level {
278        RiskLevel::Low => {
279            println!("[PASS] Ideal Zipf distribution - follows Zipf's law");
280            println!("   Distribution follows the expected 1/rank pattern");
281        }
282        RiskLevel::Medium => {
283            println!("[WARN] Slight deviation from Zipf's law");
284            println!("   Monitoring recommended for distribution pattern");
285        }
286        RiskLevel::High => {
287            println!("[FAIL] Significant deviation from Zipf's law");
288            println!("   Consider rebalancing distribution");
289        }
290        RiskLevel::Critical => {
291            println!("[CRITICAL] Critical deviation from Zipf's law");
292            println!("   Distribution strategy review needed");
293        }
294    }
295
296    // Zipf指数に基づく解釈
297    if result.zipf_exponent > 1.5 {
298        println!("   INFO: High concentration - extreme dominance pattern");
299    } else if result.zipf_exponent < 0.5 {
300        println!("   INFO: Low concentration - more uniform distribution");
301    }
302
303    // 相関係数に基づく解釈
304    if result.correlation_coefficient < 0.5 {
305        println!("   ALERT: Poor fit to Zipf's law - irregular distribution");
306    } else if result.correlation_coefficient > 0.8 {
307        println!("   INFO: Excellent fit to Zipf's law");
308    }
309}
310
311fn print_json_output(result: &ZipfResult) {
312    use serde_json::json;
313
314    let output = json!({
315        "dataset": result.dataset_name,
316        "numbers_analyzed": result.numbers_analyzed,
317        "risk_level": format!("{:?}", result.risk_level),
318        "zipf_exponent": result.zipf_exponent,
319        "correlation_coefficient": result.correlation_coefficient,
320        "distribution_quality": result.distribution_quality,
321        "total_observations": result.total_observations,
322        "unique_items": result.unique_items,
323        "top_item_frequency": result.top_item_frequency,
324        "concentration_index": result.concentration_index,
325        "diversity_index": result.diversity_index,
326        "power_law_fit": result.power_law_fit,
327        "rank_frequency_pairs": result.rank_frequency_pairs
328    });
329
330    println!("{}", serde_json::to_string_pretty(&output).unwrap());
331}
332
333fn print_csv_output(result: &ZipfResult) {
334    println!("dataset,numbers_analyzed,risk_level,zipf_exponent,correlation_coefficient,distribution_quality,power_law_fit");
335    println!(
336        "{},{},{:?},{:.3},{:.3},{:.3},{:.3}",
337        result.dataset_name,
338        result.numbers_analyzed,
339        result.risk_level,
340        result.zipf_exponent,
341        result.correlation_coefficient,
342        result.distribution_quality,
343        result.power_law_fit
344    );
345}
346
347fn print_yaml_output(result: &ZipfResult) {
348    println!("dataset: \"{}\"", result.dataset_name);
349    println!("numbers_analyzed: {}", result.numbers_analyzed);
350    println!("risk_level: \"{:?}\"", result.risk_level);
351    println!("zipf_exponent: {:.3}", result.zipf_exponent);
352    println!(
353        "correlation_coefficient: {:.3}",
354        result.correlation_coefficient
355    );
356    println!("distribution_quality: {:.3}", result.distribution_quality);
357    println!("power_law_fit: {:.3}", result.power_law_fit);
358}
359
360fn print_toml_output(result: &ZipfResult) {
361    println!("dataset = \"{}\"", result.dataset_name);
362    println!("numbers_analyzed = {}", result.numbers_analyzed);
363    println!("risk_level = \"{:?}\"", result.risk_level);
364    println!("zipf_exponent = {:.3}", result.zipf_exponent);
365    println!(
366        "correlation_coefficient = {:.3}",
367        result.correlation_coefficient
368    );
369    println!("distribution_quality = {:.3}", result.distribution_quality);
370    println!("power_law_fit = {:.3}", result.power_law_fit);
371}
372
373fn print_xml_output(result: &ZipfResult) {
374    println!("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
375    println!("<zipf_analysis>");
376    println!("  <dataset>{}</dataset>", result.dataset_name);
377    println!(
378        "  <numbers_analyzed>{}</numbers_analyzed>",
379        result.numbers_analyzed
380    );
381    println!("  <risk_level>{:?}</risk_level>", result.risk_level);
382    println!(
383        "  <zipf_exponent>{:.3}</zipf_exponent>",
384        result.zipf_exponent
385    );
386    println!(
387        "  <correlation_coefficient>{:.3}</correlation_coefficient>",
388        result.correlation_coefficient
389    );
390    println!(
391        "  <distribution_quality>{:.3}</distribution_quality>",
392        result.distribution_quality
393    );
394    println!(
395        "  <power_law_fit>{:.3}</power_law_fit>",
396        result.power_law_fit
397    );
398    println!("</zipf_analysis>");
399}
400
401/// Analyze numbers with filtering and custom options
402fn analyze_numbers_with_options(
403    matches: &clap::ArgMatches,
404    dataset_name: String,
405    numbers: &[f64],
406) -> Result<ZipfResult> {
407    // Apply number filtering if specified
408    let filtered_numbers = if let Some(filter_str) = matches.get_one::<String>("filter") {
409        let filter = NumberFilter::parse(filter_str)
410            .map_err(|e| BenfError::ParseError(format!("無効なフィルタ: {e}")))?;
411
412        let filtered = apply_number_filter(numbers, &filter);
413
414        // Inform user about filtering results
415        if filtered.len() != numbers.len() {
416            eprintln!(
417                "フィルタリング結果: {} 個の数値が {} 個に絞り込まれました ({})",
418                numbers.len(),
419                filtered.len(),
420                filter.description()
421            );
422        }
423
424        filtered
425    } else {
426        numbers.to_vec()
427    };
428
429    // Parse minimum count requirement
430    let min_count = if let Some(min_count_str) = matches.get_one::<String>("min-count") {
431        min_count_str
432            .parse::<usize>()
433            .map_err(|_| BenfError::ParseError("無効な最小数値数".to_string()))?
434    } else {
435        5
436    };
437
438    // Check minimum count requirement
439    if filtered_numbers.len() < min_count {
440        return Err(BenfError::InsufficientData(filtered_numbers.len()));
441    }
442
443    // Perform Zipf analysis
444    analyze_numeric_zipf(&filtered_numbers, &dataset_name)
445}
446
447fn format_rank_frequency_chart(result: &ZipfResult) -> String {
448    let mut output = String::new();
449    const CHART_WIDTH: usize = 50;
450
451    if result.rank_frequency_pairs.is_empty() {
452        return "No data available for chart".to_string();
453    }
454
455    // 最大頻度を取得(正規化用)
456    let max_frequency = result
457        .rank_frequency_pairs
458        .iter()
459        .map(|(_, freq)| *freq)
460        .fold(0.0, f64::max);
461
462    if max_frequency == 0.0 {
463        return "All frequencies are zero".to_string();
464    }
465
466    // ランク-頻度ペアを表示(上位10項目)
467    for (rank, frequency) in result.rank_frequency_pairs.iter().take(10) {
468        let normalized_freq = frequency / max_frequency;
469        let bar_length = (normalized_freq * CHART_WIDTH as f64).round() as usize;
470        let bar_length = bar_length.min(CHART_WIDTH);
471
472        // Calculate expected value based on ideal Zipf law (1/rank)
473        let expected_freq = max_frequency / *rank as f64;
474        let expected_normalized = expected_freq / max_frequency;
475        let expected_line_pos = (expected_normalized * CHART_WIDTH as f64).round() as usize;
476        let expected_line_pos = expected_line_pos.min(CHART_WIDTH - 1);
477
478        // Create bar with filled portion, expected value line, and background
479        let mut bar_chars = Vec::new();
480        for pos in 0..CHART_WIDTH {
481            if pos == expected_line_pos {
482                bar_chars.push('┃'); // Expected value line (ideal Zipf)
483            } else if pos < bar_length {
484                bar_chars.push('█'); // Filled portion
485            } else {
486                bar_chars.push('░'); // Background portion
487            }
488        }
489        let full_bar: String = bar_chars.iter().collect();
490
491        // パーセンテージ計算
492        let percentage = (frequency / result.total_observations as f64) * 100.0;
493        let expected_percentage = (expected_freq / result.total_observations as f64) * 100.0;
494
495        output.push_str(&format!(
496            "#{rank:2}: {full_bar} {percentage:>6.2}% (expected: {expected_percentage:.2}%)\n"
497        ));
498    }
499
500    // Zipf法則の適合度情報
501    output.push_str(&format!(
502        "\nZipf Exponent: {:.3} (ideal: 1.0), Correlation: {:.3}",
503        result.zipf_exponent, result.correlation_coefficient
504    ));
505
506    output
507}