lawkit_python/subcommands/
benf.rs

1use crate::colors;
2use clap::ArgMatches;
3use lawkit_core::{
4    common::{
5        filtering::{apply_number_filter, NumberFilter, RiskThreshold},
6        input::{parse_input_auto, parse_text_input},
7        memory::{streaming_benford_analysis, MemoryConfig},
8        risk::RiskLevel,
9        streaming_io::OptimizedFileReader,
10    },
11    error::{BenfError, Result},
12    laws::benford::BenfordResult,
13};
14use std::str::FromStr;
15
16pub fn run(matches: &ArgMatches) -> Result<()> {
17    // Determine input source based on arguments
18    if matches.get_flag("verbose") {
19        eprintln!(
20            "Debug: input argument = {:?}",
21            matches.get_one::<String>("input")
22        );
23    }
24
25    if let Some(input) = matches.get_one::<String>("input") {
26        // Use auto-detection for file vs string input
27        match parse_input_auto(input) {
28            Ok(numbers) => {
29                if numbers.is_empty() {
30                    eprintln!("Error: No valid numbers found in input");
31                    std::process::exit(1);
32                }
33
34                // Apply filtering and custom analysis
35                let result =
36                    match analyze_numbers_with_options(matches, input.to_string(), &numbers) {
37                        Ok(result) => result,
38                        Err(e) => {
39                            eprintln!("Analysis error: {e}");
40                            std::process::exit(1);
41                        }
42                    };
43
44                // Output results and exit
45                output_results(matches, &result);
46                std::process::exit(result.risk_level.exit_code());
47            }
48            Err(e) => {
49                eprintln!("Error processing input '{input}': {e}");
50                std::process::exit(1);
51            }
52        }
53    } else {
54        // Read from stdin - use automatic optimization based on data characteristics
55        if matches.get_flag("verbose") {
56            eprintln!("Debug: Reading from stdin, using automatic optimization");
57        }
58
59        // 自動最適化処理:データ特性に基づいてストリーミング処理を自動選択
60        let mut reader = OptimizedFileReader::from_stdin();
61
62        if matches.get_flag("verbose") {
63            eprintln!(
64                "Debug: Using automatic optimization (streaming + incremental + memory efficiency)"
65            );
66        }
67
68        // ストリーミング処理でインクリメンタル分析を実行
69        let numbers = match reader.read_lines_streaming(|line| {
70            if matches.get_flag("verbose") {
71                eprintln!("Debug: Processing line: '{line}'");
72            }
73            parse_text_input(&line).map(Some).or(Ok(None))
74        }) {
75            Ok(nested_numbers) => {
76                let flattened: Vec<f64> = nested_numbers.into_iter().flatten().collect();
77                if matches.get_flag("verbose") {
78                    eprintln!("Debug: Collected {} numbers from stream", flattened.len());
79                }
80                flattened
81            }
82            Err(e) => {
83                eprintln!("Analysis error: {e}");
84                std::process::exit(1);
85            }
86        };
87
88        // Apply minimum value filter for streaming analysis
89        let filtered_numbers = if let Some(min_value_str) = matches.get_one::<String>("min-value") {
90            let min_val = min_value_str
91                .parse::<f64>()
92                .map_err(|_| {
93                    eprintln!("Error: Invalid minimum value");
94                    std::process::exit(2);
95                })
96                .unwrap();
97
98            let original_len = numbers.len();
99            let filtered: Vec<f64> = numbers.into_iter().filter(|&x| x >= min_val).collect();
100
101            if matches.get_flag("verbose") {
102                eprintln!(
103                    "Debug: Min-value filter applied: {} → {} numbers (>= {})",
104                    original_len,
105                    filtered.len(),
106                    min_val
107                );
108                eprintln!(
109                    "Debug: Filter removed {} values ({:.1}%)",
110                    original_len - filtered.len(),
111                    100.0 * (original_len - filtered.len()) as f64 / original_len as f64
112                );
113            }
114            filtered
115        } else {
116            numbers
117        };
118
119        // メモリ設定を作成
120        let memory_config = MemoryConfig::default();
121
122        // インクリメンタルストリーミング分析を実行
123        let chunk_result =
124            match streaming_benford_analysis(filtered_numbers.into_iter(), &memory_config) {
125                Ok(result) => {
126                    if matches.get_flag("verbose") {
127                        eprintln!(
128                            "Debug: Streaming analysis successful - {} items processed",
129                            result.total_items
130                        );
131                    }
132                    result
133                }
134                Err(e) => {
135                    eprintln!("Streaming analysis error: {e}");
136                    std::process::exit(1);
137                }
138            };
139
140        if chunk_result.total_items == 0 {
141            if matches.get_flag("verbose") {
142                eprintln!(
143                    "Debug: Total items in chunk_result: {}",
144                    chunk_result.total_items
145                );
146            }
147            eprintln!("Error: No valid numbers found in input");
148            std::process::exit(1);
149        }
150
151        // IncrementalBenford を BenfordResult に変換
152        let benford_result =
153            convert_incremental_to_result(&chunk_result.result, "stdin".to_string(), matches);
154
155        // デバッグ情報を出力
156        if matches.get_flag("verbose") {
157            eprintln!(
158                "Debug: Processed {} numbers in {} chunks",
159                chunk_result.total_items, chunk_result.chunks_processed
160            );
161            eprintln!("Debug: Memory used: {:.2} MB", chunk_result.memory_used_mb);
162            eprintln!(
163                "Debug: Processing time: {} ms",
164                chunk_result.processing_time_ms
165            );
166        }
167
168        // Output results and exit
169        output_results(matches, &benford_result);
170        std::process::exit(benford_result.risk_level.exit_code());
171    }
172}
173
174fn output_results(matches: &clap::ArgMatches, result: &BenfordResult) {
175    let format = matches.get_one::<String>("format").unwrap();
176    let quiet = matches.get_flag("quiet");
177    let verbose = matches.get_flag("verbose");
178
179    match format.as_str() {
180        "text" => print_text_output(result, quiet, verbose),
181        "json" => print_json_output(result),
182        "csv" => print_csv_output(result),
183        "yaml" => print_yaml_output(result),
184        "toml" => print_toml_output(result),
185        "xml" => print_xml_output(result),
186        _ => {
187            eprintln!("Error: Unsupported output format: {format}");
188            std::process::exit(2);
189        }
190    }
191}
192
193fn print_text_output(result: &BenfordResult, quiet: bool, verbose: bool) {
194    if quiet {
195        for (i, &observed) in result.digit_distribution.iter().enumerate() {
196            println!("{}: {:.1}%", i + 1, observed);
197        }
198        return;
199    }
200
201    println!("Benford Law Analysis Results");
202    println!();
203    println!("Dataset: {}", result.dataset_name);
204    println!("Numbers analyzed: {}", result.numbers_analyzed);
205    match result.risk_level {
206        RiskLevel::Critical => println!("{}", colors::level_critical("Dataset analysis")),
207        RiskLevel::High => println!("{}", colors::level_high("Dataset analysis")),
208        RiskLevel::Medium => println!("{}", colors::level_medium("Dataset analysis")),
209        RiskLevel::Low => println!("{}", colors::level_low("Dataset analysis")),
210    }
211
212    println!();
213    println!("First Digit Distribution:");
214    println!("{}", format_distribution_bars(result));
215
216    if verbose {
217        println!();
218        println!("First Digit Distribution:");
219        for (i, &observed) in result.digit_distribution.iter().enumerate() {
220            let digit = i + 1;
221            let expected = result.expected_distribution[i];
222            let deviation = observed - expected;
223
224            println!(
225                "{digit}: {observed:.1}% (expected: {expected:.1}%, deviation: {deviation:+.1}%)"
226            );
227        }
228
229        println!();
230        println!("Statistical Tests:");
231        println!(
232            "Chi-square: {:.2} (p-value: {:.6})",
233            result.chi_square, result.p_value
234        );
235    }
236}
237
238fn print_json_output(result: &BenfordResult) {
239    use serde_json::json;
240
241    let output = json!({
242        "dataset": result.dataset_name,
243        "numbers_analyzed": result.numbers_analyzed,
244        "risk_level": format!("{:?}", result.risk_level),
245        "chi_square": result.chi_square,
246        "p_value": result.p_value,
247        "mean_absolute_deviation": result.mean_absolute_deviation
248    });
249
250    println!("{}", serde_json::to_string_pretty(&output).unwrap());
251}
252
253fn print_csv_output(result: &BenfordResult) {
254    println!("dataset,numbers_analyzed,risk_level,chi_square,p_value,mad");
255    println!(
256        "{},{},{:?},{:.6},{:.6},{:.2}",
257        result.dataset_name,
258        result.numbers_analyzed,
259        result.risk_level,
260        result.chi_square,
261        result.p_value,
262        result.mean_absolute_deviation
263    );
264}
265
266fn print_yaml_output(result: &BenfordResult) {
267    println!("dataset: \"{}\"", result.dataset_name);
268    println!("numbers_analyzed: {}", result.numbers_analyzed);
269    println!("risk_level: \"{:?}\"", result.risk_level);
270    println!("chi_square: {:.6}", result.chi_square);
271    println!("p_value: {:.6}", result.p_value);
272    println!("mad: {:.2}", result.mean_absolute_deviation);
273}
274
275fn print_toml_output(result: &BenfordResult) {
276    println!("dataset = \"{}\"", result.dataset_name);
277    println!("numbers_analyzed = {}", result.numbers_analyzed);
278    println!("risk_level = \"{:?}\"", result.risk_level);
279    println!("chi_square = {:.6}", result.chi_square);
280    println!("p_value = {:.6}", result.p_value);
281    println!("mad = {:.2}", result.mean_absolute_deviation);
282}
283
284fn print_xml_output(result: &BenfordResult) {
285    println!("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
286    println!("<benford_analysis>");
287    println!("  <dataset>{}</dataset>", result.dataset_name);
288    println!(
289        "  <numbers_analyzed>{}</numbers_analyzed>",
290        result.numbers_analyzed
291    );
292    println!("  <risk_level>{:?}</risk_level>", result.risk_level);
293    println!("  <chi_square>{:.6}</chi_square>", result.chi_square);
294    println!("  <p_value>{:.6}</p_value>", result.p_value);
295    println!("  <mad>{:.2}</mad>", result.mean_absolute_deviation);
296    println!("</benford_analysis>");
297}
298
299/// Analyze numbers with filtering and custom options
300fn analyze_numbers_with_options(
301    matches: &clap::ArgMatches,
302    dataset_name: String,
303    numbers: &[f64],
304) -> Result<BenfordResult> {
305    // Apply number filtering if specified
306    let filtered_numbers = if let Some(filter_str) = matches.get_one::<String>("filter") {
307        let filter = NumberFilter::parse(filter_str)
308            .map_err(|e| BenfError::ParseError(format!("無効なフィルタ: {e}")))?;
309
310        let filtered = apply_number_filter(numbers, &filter);
311
312        // Inform user about filtering results
313        if filtered.len() != numbers.len() {
314            eprintln!(
315                "フィルタリング結果: {} 個の数値が {} 個に絞り込まれました ({})",
316                numbers.len(),
317                filtered.len(),
318                filter.description()
319            );
320        }
321
322        filtered
323    } else {
324        numbers.to_vec()
325    };
326
327    // Parse custom threshold if specified
328    let threshold = if let Some(threshold_str) = matches.get_one::<String>("threshold") {
329        if threshold_str == "auto" {
330            RiskThreshold::Auto
331        } else {
332            RiskThreshold::from_str(threshold_str)
333                .map_err(|e| BenfError::ParseError(format!("無効な閾値: {e}")))?
334        }
335    } else {
336        RiskThreshold::Auto
337    };
338
339    // Parse minimum count requirement
340    let min_count = if let Some(min_count_str) = matches.get_one::<String>("min-count") {
341        min_count_str
342            .parse::<usize>()
343            .map_err(|_| BenfError::ParseError("無効な最小数値数".to_string()))?
344    } else {
345        5
346    };
347
348    // Parse confidence level
349    let _confidence = if let Some(confidence_str) = matches.get_one::<String>("confidence") {
350        let conf = confidence_str
351            .parse::<f64>()
352            .map_err(|_| BenfError::ParseError("無効な信頼度レベル".to_string()))?;
353        if !(0.01..=0.99).contains(&conf) {
354            return Err(BenfError::ParseError(
355                "信頼度レベルは0.01から0.99の間である必要があります".to_string(),
356            ));
357        }
358        conf
359    } else {
360        0.95
361    };
362
363    // Parse sample size limit
364    let mut working_numbers = filtered_numbers.clone();
365    if let Some(sample_size_str) = matches.get_one::<String>("sample-size") {
366        let max_size = sample_size_str
367            .parse::<usize>()
368            .map_err(|_| BenfError::ParseError("無効なサンプルサイズ".to_string()))?;
369
370        if working_numbers.len() > max_size {
371            eprintln!(
372                "大規模データセット: {}個の数値を{}個にサンプリングしました",
373                working_numbers.len(),
374                max_size
375            );
376            // Simple random sampling by taking every nth element
377            let step = working_numbers.len() / max_size;
378            working_numbers = working_numbers
379                .iter()
380                .step_by(step.max(1))
381                .cloned()
382                .take(max_size)
383                .collect();
384        }
385    }
386
387    // Apply minimum value filter
388    if let Some(min_value_str) = matches.get_one::<String>("min-value") {
389        let min_val = min_value_str
390            .parse::<f64>()
391            .map_err(|_| BenfError::ParseError("無効な最小値".to_string()))?;
392
393        let original_len = working_numbers.len();
394        working_numbers.retain(|&x| x >= min_val);
395
396        if working_numbers.len() != original_len {
397            if matches.get_flag("verbose") {
398                eprintln!(
399                    "Debug: Min-value filter applied: {} → {} numbers (>= {})",
400                    original_len,
401                    working_numbers.len(),
402                    min_val
403                );
404                eprintln!(
405                    "Debug: Filter removed {} values ({:.1}%)",
406                    original_len - working_numbers.len(),
407                    100.0 * (original_len - working_numbers.len()) as f64 / original_len as f64
408                );
409            } else {
410                eprintln!(
411                    "最小値フィルタ適用: {}個の数値が{}個に絞り込まれました (>= {})",
412                    original_len,
413                    working_numbers.len(),
414                    min_val
415                );
416            }
417        }
418    }
419
420    // Perform Benford analysis with custom options
421    BenfordResult::new_with_threshold(dataset_name, &working_numbers, &threshold, min_count)
422}
423
424/// IncrementalBenford を BenfordResult に変換
425fn convert_incremental_to_result(
426    incremental: &lawkit_core::common::memory::IncrementalBenford,
427    dataset_name: String,
428    _matches: &clap::ArgMatches,
429) -> BenfordResult {
430    use lawkit_core::common::statistics;
431
432    // 分布データを取得
433    let digit_distribution = incremental.get_distribution();
434    let expected_distribution = [
435        30.103, 17.609, 12.494, 9.691, 7.918, 6.695, 5.799, 5.115, 4.576,
436    ];
437
438    // 統計値を計算
439    let chi_square = statistics::calculate_chi_square(&digit_distribution, &expected_distribution);
440    let p_value = statistics::calculate_p_value(chi_square, 8);
441    let mean_absolute_deviation = incremental.calculate_mad();
442
443    // リスクレベルを決定
444    let risk_level = determine_risk_level(mean_absolute_deviation, p_value);
445
446    // 判定を生成
447    let verdict = format!("Risk Level: {risk_level:?}");
448
449    BenfordResult {
450        dataset_name,
451        numbers_analyzed: incremental.total_count(),
452        digit_distribution,
453        expected_distribution,
454        chi_square,
455        p_value,
456        mean_absolute_deviation,
457        risk_level,
458        verdict,
459    }
460}
461
462/// リスクレベルを決定
463fn determine_risk_level(mad: f64, p_value: f64) -> RiskLevel {
464    if mad > 15.0 || p_value < 0.01 {
465        RiskLevel::Critical
466    } else if mad > 10.0 || p_value < 0.05 {
467        RiskLevel::High
468    } else if mad > 5.0 || p_value < 0.10 {
469        RiskLevel::Medium
470    } else {
471        RiskLevel::Low
472    }
473}
474
475fn format_distribution_bars(result: &BenfordResult) -> String {
476    let mut output = String::new();
477    const CHART_WIDTH: usize = 50;
478
479    for i in 0..9 {
480        let digit = i + 1;
481        let observed = result.digit_distribution[i];
482        let expected = result.expected_distribution[i];
483        let bar_length = ((observed / 100.0) * CHART_WIDTH as f64).round() as usize;
484        let bar_length = bar_length.min(CHART_WIDTH); // Ensure we don't exceed max width
485
486        // Calculate expected value line position
487        let expected_line_pos = ((expected / 100.0) * CHART_WIDTH as f64).round() as usize;
488        let expected_line_pos = expected_line_pos.min(CHART_WIDTH - 1); // Ensure it's within bounds
489
490        // Create bar with filled portion, expected value line, and background
491        let mut bar_chars = Vec::new();
492        for pos in 0..CHART_WIDTH {
493            if pos == expected_line_pos {
494                bar_chars.push('┃'); // Expected value line (always visible)
495            } else if pos < bar_length {
496                bar_chars.push('█'); // Filled portion
497            } else {
498                bar_chars.push('░'); // Background portion
499            }
500        }
501        let full_bar: String = bar_chars.iter().collect();
502
503        output.push_str(&format!(
504            "{digit:1}: {full_bar} {observed:>5.1}% (expected: {expected:>5.1}%)\n"
505        ));
506    }
507
508    output
509}