Skip to main content

lawkit_python/subcommands/
benf.rs

1use crate::colors;
2use clap::ArgMatches;
3use lawkit_core::{
4    common::{
5        filtering::{apply_number_filter, NumberFilter, RiskThreshold},
6        input::{parse_input_auto, parse_text_input},
7        memory::{streaming_benford_analysis, MemoryConfig},
8        risk::RiskLevel,
9        streaming_io::OptimizedFileReader,
10    },
11    error::{BenfError, Result},
12    laws::benford::BenfordResult,
13};
14use std::str::FromStr;
15
16pub fn run(matches: &ArgMatches) -> Result<()> {
17    // Determine input source based on arguments
18    if matches.get_flag("verbose") {
19        eprintln!(
20            "Debug: input argument = {:?}",
21            matches.get_one::<String>("input")
22        );
23    }
24
25    if let Some(input) = matches.get_one::<String>("input") {
26        // Use auto-detection for file vs string input
27        match parse_input_auto(input) {
28            Ok(numbers) => {
29                if numbers.is_empty() {
30                    eprintln!("Error: No valid numbers found in input");
31                    std::process::exit(1);
32                }
33
34                // Apply filtering and custom analysis
35                let result =
36                    match analyze_numbers_with_options(matches, input.to_string(), &numbers) {
37                        Ok(result) => result,
38                        Err(e) => {
39                            eprintln!("Analysis error: {e}");
40                            std::process::exit(1);
41                        }
42                    };
43
44                // Output results and exit
45                output_results(matches, &result);
46                std::process::exit(result.risk_level.exit_code());
47            }
48            Err(e) => {
49                eprintln!("Error processing input '{input}': {e}");
50                std::process::exit(1);
51            }
52        }
53    } else {
54        // Read from stdin - use automatic optimization based on data characteristics
55        if matches.get_flag("verbose") {
56            eprintln!("Debug: Reading from stdin, using automatic optimization");
57        }
58
59        // 自動最適化処理:データ特性に基づいてストリーミング処理を自動選択
60        let mut reader = OptimizedFileReader::from_stdin();
61
62        if matches.get_flag("verbose") {
63            eprintln!(
64                "Debug: Using automatic optimization (streaming + incremental + memory efficiency)"
65            );
66        }
67
68        // ストリーミング処理でインクリメンタル分析を実行
69        let numbers = match reader.read_lines_streaming(|line| {
70            if matches.get_flag("verbose") {
71                eprintln!("Debug: Processing line: '{line}'");
72            }
73            parse_text_input(&line).map(Some).or(Ok(None))
74        }) {
75            Ok(nested_numbers) => {
76                let flattened: Vec<f64> = nested_numbers.into_iter().flatten().collect();
77                if matches.get_flag("verbose") {
78                    eprintln!("Debug: Collected {} numbers from stream", flattened.len());
79                }
80                flattened
81            }
82            Err(e) => {
83                eprintln!("Analysis error: {e}");
84                std::process::exit(1);
85            }
86        };
87
88        // Apply minimum value filter for streaming analysis
89        let filtered_numbers = if let Some(min_value_str) = matches.get_one::<String>("min-value") {
90            let min_val = min_value_str
91                .parse::<f64>()
92                .map_err(|_| {
93                    eprintln!("Error: Invalid minimum value");
94                    std::process::exit(2);
95                })
96                .unwrap();
97
98            let original_len = numbers.len();
99            let filtered: Vec<f64> = numbers.into_iter().filter(|&x| x >= min_val).collect();
100
101            if matches.get_flag("verbose") {
102                eprintln!(
103                    "Debug: Min-value filter applied: {} → {} numbers (>= {})",
104                    original_len,
105                    filtered.len(),
106                    min_val
107                );
108                eprintln!(
109                    "Debug: Filter removed {} values ({:.1}%)",
110                    original_len - filtered.len(),
111                    100.0 * (original_len - filtered.len()) as f64 / original_len as f64
112                );
113            }
114            filtered
115        } else {
116            numbers
117        };
118
119        // メモリ設定を作成
120        let memory_config = MemoryConfig::default();
121
122        // インクリメンタルストリーミング分析を実行
123        let chunk_result =
124            match streaming_benford_analysis(filtered_numbers.into_iter(), &memory_config) {
125                Ok(result) => {
126                    if matches.get_flag("verbose") {
127                        eprintln!(
128                            "Debug: Streaming analysis successful - {} items processed",
129                            result.total_items
130                        );
131                    }
132                    result
133                }
134                Err(e) => {
135                    eprintln!("Streaming analysis error: {e}");
136                    std::process::exit(1);
137                }
138            };
139
140        if chunk_result.total_items == 0 {
141            if matches.get_flag("verbose") {
142                eprintln!(
143                    "Debug: Total items in chunk_result: {}",
144                    chunk_result.total_items
145                );
146            }
147            eprintln!("Error: No valid numbers found in input");
148            std::process::exit(1);
149        }
150
151        // IncrementalBenford を BenfordResult に変換
152        let benford_result =
153            convert_incremental_to_result(&chunk_result.result, "stdin".to_string(), matches);
154
155        // デバッグ情報を出力
156        if matches.get_flag("verbose") {
157            eprintln!(
158                "Debug: Processed {} numbers in {} chunks",
159                chunk_result.total_items, chunk_result.chunks_processed
160            );
161            eprintln!("Debug: Memory used: {:.2} MB", chunk_result.memory_used_mb);
162            eprintln!(
163                "Debug: Processing time: {} ms",
164                chunk_result.processing_time_ms
165            );
166        }
167
168        // Output results and exit
169        output_results(matches, &benford_result);
170        std::process::exit(benford_result.risk_level.exit_code());
171    }
172}
173
174fn output_results(matches: &clap::ArgMatches, result: &BenfordResult) {
175    let format = matches.get_one::<String>("format").unwrap();
176    let quiet = matches.get_flag("quiet");
177    let verbose = matches.get_flag("verbose");
178    let no_color = matches.get_flag("no-color");
179
180    match format.as_str() {
181        "text" => print_text_output(result, quiet, verbose, no_color),
182        "json" => print_json_output(result),
183        "csv" => print_csv_output(result),
184        "yaml" => print_yaml_output(result),
185        "toml" => print_toml_output(result),
186        "xml" => print_xml_output(result),
187        _ => {
188            eprintln!("Error: Unsupported output format: {format}");
189            std::process::exit(2);
190        }
191    }
192}
193
194fn print_text_output(result: &BenfordResult, quiet: bool, verbose: bool, no_color: bool) {
195    if quiet {
196        for (i, &observed) in result.digit_distribution.iter().enumerate() {
197            println!("{}: {:.1}%", i + 1, observed);
198        }
199        return;
200    }
201
202    println!("Benford Law Analysis Results");
203    println!();
204    println!("Dataset: {}", result.dataset_name);
205    println!("Numbers analyzed: {}", result.numbers_analyzed);
206    match result.risk_level {
207        RiskLevel::Critical => println!("{}", colors::level_critical("Dataset analysis", no_color)),
208        RiskLevel::High => println!("{}", colors::level_high("Dataset analysis", no_color)),
209        RiskLevel::Medium => println!("{}", colors::level_medium("Dataset analysis", no_color)),
210        RiskLevel::Low => println!("{}", colors::level_low("Dataset analysis", no_color)),
211    }
212
213    println!();
214    println!("First Digit Distribution:");
215    println!("{}", format_distribution_bars(result));
216
217    if verbose {
218        println!();
219        println!("First Digit Distribution:");
220        for (i, &observed) in result.digit_distribution.iter().enumerate() {
221            let digit = i + 1;
222            let expected = result.expected_distribution[i];
223            let deviation = observed - expected;
224
225            println!(
226                "{digit}: {observed:.1}% (expected: {expected:.1}%, deviation: {deviation:+.1}%)"
227            );
228        }
229
230        println!();
231        println!("Statistical Tests:");
232        println!(
233            "Chi-square: {:.2} (p-value: {:.6})",
234            result.chi_square, result.p_value
235        );
236    }
237}
238
239fn print_json_output(result: &BenfordResult) {
240    use serde_json::json;
241
242    let output = json!({
243        "dataset": result.dataset_name,
244        "numbers_analyzed": result.numbers_analyzed,
245        "risk_level": format!("{:?}", result.risk_level),
246        "chi_square": result.chi_square,
247        "p_value": result.p_value,
248        "mean_absolute_deviation": result.mean_absolute_deviation
249    });
250
251    println!("{}", serde_json::to_string_pretty(&output).unwrap());
252}
253
254fn print_csv_output(result: &BenfordResult) {
255    println!("dataset,numbers_analyzed,risk_level,chi_square,p_value,mad");
256    println!(
257        "{},{},{:?},{:.6},{:.6},{:.2}",
258        result.dataset_name,
259        result.numbers_analyzed,
260        result.risk_level,
261        result.chi_square,
262        result.p_value,
263        result.mean_absolute_deviation
264    );
265}
266
267fn print_yaml_output(result: &BenfordResult) {
268    println!("dataset: \"{}\"", result.dataset_name);
269    println!("numbers_analyzed: {}", result.numbers_analyzed);
270    println!("risk_level: \"{:?}\"", result.risk_level);
271    println!("chi_square: {:.6}", result.chi_square);
272    println!("p_value: {:.6}", result.p_value);
273    println!("mad: {:.2}", result.mean_absolute_deviation);
274}
275
276fn print_toml_output(result: &BenfordResult) {
277    println!("dataset = \"{}\"", result.dataset_name);
278    println!("numbers_analyzed = {}", result.numbers_analyzed);
279    println!("risk_level = \"{:?}\"", result.risk_level);
280    println!("chi_square = {:.6}", result.chi_square);
281    println!("p_value = {:.6}", result.p_value);
282    println!("mad = {:.2}", result.mean_absolute_deviation);
283}
284
285fn print_xml_output(result: &BenfordResult) {
286    println!("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
287    println!("<benford_analysis>");
288    println!("  <dataset>{}</dataset>", result.dataset_name);
289    println!(
290        "  <numbers_analyzed>{}</numbers_analyzed>",
291        result.numbers_analyzed
292    );
293    println!("  <risk_level>{:?}</risk_level>", result.risk_level);
294    println!("  <chi_square>{:.6}</chi_square>", result.chi_square);
295    println!("  <p_value>{:.6}</p_value>", result.p_value);
296    println!("  <mad>{:.2}</mad>", result.mean_absolute_deviation);
297    println!("</benford_analysis>");
298}
299
300/// Analyze numbers with filtering and custom options
301fn analyze_numbers_with_options(
302    matches: &clap::ArgMatches,
303    dataset_name: String,
304    numbers: &[f64],
305) -> Result<BenfordResult> {
306    // Apply number filtering if specified
307    let filtered_numbers = if let Some(filter_str) = matches.get_one::<String>("filter") {
308        let filter = NumberFilter::parse(filter_str)
309            .map_err(|e| BenfError::ParseError(format!("無効なフィルタ: {e}")))?;
310
311        let filtered = apply_number_filter(numbers, &filter);
312
313        // Inform user about filtering results
314        if filtered.len() != numbers.len() {
315            eprintln!(
316                "フィルタリング結果: {} 個の数値が {} 個に絞り込まれました ({})",
317                numbers.len(),
318                filtered.len(),
319                filter.description()
320            );
321        }
322
323        filtered
324    } else {
325        numbers.to_vec()
326    };
327
328    // Parse custom threshold if specified
329    let threshold = if let Some(threshold_str) = matches.get_one::<String>("threshold") {
330        if threshold_str == "auto" {
331            RiskThreshold::Auto
332        } else {
333            RiskThreshold::from_str(threshold_str)
334                .map_err(|e| BenfError::ParseError(format!("無効な閾値: {e}")))?
335        }
336    } else {
337        RiskThreshold::Auto
338    };
339
340    // Parse minimum count requirement
341    let min_count = if let Some(min_count_str) = matches.get_one::<String>("min-count") {
342        min_count_str
343            .parse::<usize>()
344            .map_err(|_| BenfError::ParseError("無効な最小数値数".to_string()))?
345    } else {
346        5
347    };
348
349    // Parse confidence level
350    let _confidence = if let Some(confidence_str) = matches.get_one::<String>("confidence") {
351        let conf = confidence_str
352            .parse::<f64>()
353            .map_err(|_| BenfError::ParseError("無効な信頼度レベル".to_string()))?;
354        if !(0.01..=0.99).contains(&conf) {
355            return Err(BenfError::ParseError(
356                "信頼度レベルは0.01から0.99の間である必要があります".to_string(),
357            ));
358        }
359        conf
360    } else {
361        0.95
362    };
363
364    // Parse sample size limit
365    let mut working_numbers = filtered_numbers.clone();
366    if let Some(sample_size_str) = matches.get_one::<String>("sample-size") {
367        let max_size = sample_size_str
368            .parse::<usize>()
369            .map_err(|_| BenfError::ParseError("無効なサンプルサイズ".to_string()))?;
370
371        if working_numbers.len() > max_size {
372            eprintln!(
373                "大規模データセット: {}個の数値を{}個にサンプリングしました",
374                working_numbers.len(),
375                max_size
376            );
377            // Simple random sampling by taking every nth element
378            let step = working_numbers.len() / max_size;
379            working_numbers = working_numbers
380                .iter()
381                .step_by(step.max(1))
382                .cloned()
383                .take(max_size)
384                .collect();
385        }
386    }
387
388    // Apply minimum value filter
389    if let Some(min_value_str) = matches.get_one::<String>("min-value") {
390        let min_val = min_value_str
391            .parse::<f64>()
392            .map_err(|_| BenfError::ParseError("無効な最小値".to_string()))?;
393
394        let original_len = working_numbers.len();
395        working_numbers.retain(|&x| x >= min_val);
396
397        if working_numbers.len() != original_len {
398            if matches.get_flag("verbose") {
399                eprintln!(
400                    "Debug: Min-value filter applied: {} → {} numbers (>= {})",
401                    original_len,
402                    working_numbers.len(),
403                    min_val
404                );
405                eprintln!(
406                    "Debug: Filter removed {} values ({:.1}%)",
407                    original_len - working_numbers.len(),
408                    100.0 * (original_len - working_numbers.len()) as f64 / original_len as f64
409                );
410            } else {
411                eprintln!(
412                    "最小値フィルタ適用: {}個の数値が{}個に絞り込まれました (>= {})",
413                    original_len,
414                    working_numbers.len(),
415                    min_val
416                );
417            }
418        }
419    }
420
421    // Perform Benford analysis with custom options
422    BenfordResult::new_with_threshold(dataset_name, &working_numbers, &threshold, min_count)
423}
424
425/// IncrementalBenford を BenfordResult に変換
426fn convert_incremental_to_result(
427    incremental: &lawkit_core::common::memory::IncrementalBenford,
428    dataset_name: String,
429    _matches: &clap::ArgMatches,
430) -> BenfordResult {
431    use lawkit_core::common::statistics;
432
433    // 分布データを取得
434    let digit_distribution = incremental.get_distribution();
435    let expected_distribution = [
436        30.103, 17.609, 12.494, 9.691, 7.918, 6.695, 5.799, 5.115, 4.576,
437    ];
438
439    // 統計値を計算
440    let chi_square = statistics::calculate_chi_square(&digit_distribution, &expected_distribution);
441    let p_value = statistics::calculate_p_value(chi_square, 8);
442    let mean_absolute_deviation = incremental.calculate_mad();
443
444    // リスクレベルを決定
445    let risk_level = determine_risk_level(mean_absolute_deviation, p_value);
446
447    // 判定を生成
448    let verdict = format!("Risk Level: {risk_level:?}");
449
450    BenfordResult {
451        dataset_name,
452        numbers_analyzed: incremental.total_count(),
453        digit_distribution,
454        expected_distribution,
455        chi_square,
456        p_value,
457        mean_absolute_deviation,
458        risk_level,
459        verdict,
460    }
461}
462
463/// リスクレベルを決定
464fn determine_risk_level(mad: f64, p_value: f64) -> RiskLevel {
465    if mad > 15.0 || p_value < 0.01 {
466        RiskLevel::Critical
467    } else if mad > 10.0 || p_value < 0.05 {
468        RiskLevel::High
469    } else if mad > 5.0 || p_value < 0.10 {
470        RiskLevel::Medium
471    } else {
472        RiskLevel::Low
473    }
474}
475
476fn format_distribution_bars(result: &BenfordResult) -> String {
477    let mut output = String::new();
478    const CHART_WIDTH: usize = 50;
479
480    for i in 0..9 {
481        let digit = i + 1;
482        let observed = result.digit_distribution[i];
483        let expected = result.expected_distribution[i];
484        let bar_length = ((observed / 100.0) * CHART_WIDTH as f64).round() as usize;
485        let bar_length = bar_length.min(CHART_WIDTH); // Ensure we don't exceed max width
486
487        // Calculate expected value line position
488        let expected_line_pos = ((expected / 100.0) * CHART_WIDTH as f64).round() as usize;
489        let expected_line_pos = expected_line_pos.min(CHART_WIDTH - 1); // Ensure it's within bounds
490
491        // Create bar with filled portion, expected value line, and background
492        let mut bar_chars = Vec::new();
493        for pos in 0..CHART_WIDTH {
494            if pos == expected_line_pos {
495                bar_chars.push('┃'); // Expected value line (always visible)
496            } else if pos < bar_length {
497                bar_chars.push('█'); // Filled portion
498            } else {
499                bar_chars.push('░'); // Background portion
500            }
501        }
502        let full_bar: String = bar_chars.iter().collect();
503
504        output.push_str(&format!(
505            "{digit:1}: {full_bar} {observed:>5.1}% (expected: {expected:>5.1}%)\n"
506        ));
507    }
508
509    output
510}