1use anyhow::{anyhow, Result};
2use serde_json::Value;
3use std::collections::HashMap;
4
5pub use crate::types::*;
11
12pub fn law(
21 subcommand: &str,
22 data_or_config: &Value,
23 options: Option<&LawkitOptions>,
24) -> Result<Vec<LawkitResult>> {
25 let default_options = LawkitOptions::default();
26 let opts = options.unwrap_or(&default_options);
27
28 match subcommand {
29 "benf" | "benford" => analyze_benford_law(data_or_config, opts),
30 "pareto" => analyze_pareto_principle(data_or_config, opts),
31 "zipf" => analyze_zipf_law(data_or_config, opts),
32 "normal" => analyze_normal_distribution(data_or_config, opts),
33 "poisson" => analyze_poisson_distribution(data_or_config, opts),
34 "analyze" => analyze_all_laws(data_or_config, opts),
35 "validate" => validate_data(data_or_config, opts),
36 "diagnose" => diagnose_data(data_or_config, opts),
37 "generate" => generate_sample_data(data_or_config, opts),
38 _ => Err(anyhow!("Unknown subcommand: {}", subcommand)),
39 }
40}
41
42fn analyze_benford_law(data: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
43 let numbers = extract_numbers_from_value(data)?;
44
45 if numbers.is_empty() {
46 return Err(anyhow!("No valid numbers found in input data"));
47 }
48
49 let mut observed_counts = [0.0; 9];
51 let mut total = 0;
52
53 for &num in &numbers {
54 if let Some(digit) = get_first_digit(num.abs()) {
55 observed_counts[digit as usize - 1] += 1.0;
56 total += 1;
57 }
58 }
59
60 let expected_proportions = [
62 2.0_f64.log10(), (1.0_f64 + 1.0 / 2.0).log10(), (1.0_f64 + 1.0 / 3.0).log10(), (1.0_f64 + 1.0 / 4.0).log10(), (1.0_f64 + 1.0 / 5.0).log10(), (1.0_f64 + 1.0 / 6.0).log10(), (1.0_f64 + 1.0 / 7.0).log10(), (1.0_f64 + 1.0 / 8.0).log10(), (1.0_f64 + 1.0 / 9.0).log10(), ];
72
73 let total_f = total as f64;
75 let expected_counts: [f64; 9] = expected_proportions.map(|p| p * total_f);
76
77 let observed: [f64; 9] = observed_counts.map(|c| c / total_f);
79
80 let chi_square = calculate_chi_square(&observed_counts, &expected_counts);
82 let p_value = calculate_p_value(chi_square, 8);
83 let mad = calculate_mad(&observed, &expected_proportions);
84
85 let risk_level = if p_value < 0.05 {
87 "HIGH"
88 } else if p_value < 0.1 {
89 "MEDIUM"
90 } else {
91 "LOW"
92 }
93 .to_string();
94
95 let analysis_summary =
96 format!("Benford's law analysis: p-value={p_value:.4}, MAD={mad:.4}, risk={risk_level}");
97
98 let benford_data = BenfordData {
99 observed_distribution: observed,
100 expected_distribution: expected_proportions,
101 chi_square,
102 p_value,
103 mad,
104 risk_level,
105 total_numbers: total,
106 analysis_summary,
107 };
108
109 Ok(vec![LawkitResult::BenfordAnalysis(
110 "benford_analysis".to_string(),
111 benford_data,
112 )])
113}
114
115fn analyze_pareto_principle(data: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
116 let numbers = extract_numbers_from_value(data)?;
117
118 if numbers.is_empty() {
119 return Err(anyhow!("No valid numbers found in input data"));
120 }
121
122 let mut sorted_numbers = numbers.clone();
124 sorted_numbers.sort_by(|a, b| b.partial_cmp(a).unwrap());
125
126 let total_sum: f64 = sorted_numbers.iter().sum();
127 let total_count = sorted_numbers.len();
128 let top_20_count = (total_count as f64 * 0.2).ceil() as usize;
129
130 let top_20_sum: f64 = sorted_numbers.iter().take(top_20_count).sum();
131 let top_20_percent_contribution = (top_20_sum / total_sum) * 100.0;
132
133 let pareto_ratio = top_20_percent_contribution / 80.0; let mut cumulative_sum = 0.0;
138 let mut concentration_index = 0.0;
139 for (i, &value) in sorted_numbers.iter().enumerate() {
140 cumulative_sum += value;
141 let proportion = (i + 1) as f64 / total_count as f64;
142 let cumulative_proportion = cumulative_sum / total_sum;
143 concentration_index += (proportion - cumulative_proportion).abs();
144 }
145 concentration_index /= total_count as f64;
146
147 let risk_level = if top_20_percent_contribution < 60.0 {
149 "LOW" } else if top_20_percent_contribution > 95.0 {
151 "HIGH" } else {
153 "MEDIUM"
154 }
155 .to_string();
156
157 let analysis_summary = format!(
158 "Pareto analysis: top 20% contributes {top_20_percent_contribution:.1}%, concentration index={concentration_index:.3}, risk={risk_level}"
159 );
160
161 let pareto_data = ParetoData {
162 top_20_percent_contribution,
163 pareto_ratio,
164 concentration_index,
165 risk_level,
166 total_items: total_count,
167 analysis_summary,
168 };
169
170 Ok(vec![LawkitResult::ParetoAnalysis(
171 "pareto_analysis".to_string(),
172 pareto_data,
173 )])
174}
175
176fn analyze_zipf_law(data: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
177 let numbers = extract_numbers_from_value(data)?;
178
179 if numbers.is_empty() {
180 return Err(anyhow!("No valid numbers found in input data"));
181 }
182
183 let mut frequency_map: HashMap<String, f64> = HashMap::new();
185 for &num in &numbers {
186 let key = format!("{num:.6}"); *frequency_map.entry(key).or_insert(0.0) += 1.0;
188 }
189
190 let mut frequencies: Vec<f64> = frequency_map.values().cloned().collect();
191 frequencies.sort_by(|a, b| b.partial_cmp(a).unwrap());
192
193 if frequencies.len() < 2 {
194 return Err(anyhow!("Insufficient unique values for Zipf analysis"));
195 }
196
197 let mut log_ranks: Vec<f64> = Vec::new();
199 let mut log_frequencies: Vec<f64> = Vec::new();
200
201 for (rank, &freq) in frequencies.iter().enumerate() {
202 if freq > 0.0 {
203 log_ranks.push((rank + 1) as f64);
204 log_frequencies.push(freq);
205 }
206 }
207
208 let zipf_coefficient = calculate_zipf_coefficient(&log_ranks, &log_frequencies);
209 let correlation = calculate_correlation(&log_ranks, &log_frequencies);
210 let deviation_score = (zipf_coefficient - 1.0).abs(); let risk_level = if deviation_score < 0.2 {
214 "LOW" } else if deviation_score > 0.8 {
216 "HIGH" } else {
218 "MEDIUM"
219 }
220 .to_string();
221
222 let analysis_summary = format!(
223 "Zipf analysis: coefficient={zipf_coefficient:.3}, correlation={correlation:.3}, deviation={deviation_score:.3}, risk={risk_level}"
224 );
225
226 let zipf_data = ZipfData {
227 zipf_coefficient,
228 correlation_coefficient: correlation,
229 deviation_score,
230 risk_level,
231 total_items: frequencies.len(),
232 analysis_summary,
233 };
234
235 Ok(vec![LawkitResult::ZipfAnalysis(
236 "zipf_analysis".to_string(),
237 zipf_data,
238 )])
239}
240
241fn analyze_normal_distribution(
242 data: &Value,
243 _options: &LawkitOptions,
244) -> Result<Vec<LawkitResult>> {
245 let numbers = extract_numbers_from_value(data)?;
246
247 if numbers.is_empty() {
248 return Err(anyhow!("No valid numbers found in input data"));
249 }
250
251 if numbers.len() < 3 {
252 return Err(anyhow!(
253 "Insufficient data points for normal distribution analysis"
254 ));
255 }
256
257 let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
259 let variance =
260 numbers.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (numbers.len() - 1) as f64;
261 let std_dev = variance.sqrt();
262
263 let skewness = calculate_skewness(&numbers, mean, std_dev);
265 let kurtosis = calculate_kurtosis(&numbers, mean, std_dev);
266
267 let normality_test_p = calculate_normality_p_value(&numbers);
269
270 let risk_level = if normality_test_p > 0.05 && skewness.abs() < 1.0 && kurtosis.abs() < 3.0 {
272 "LOW" } else if normality_test_p < 0.01 || skewness.abs() > 2.0 || kurtosis.abs() > 5.0 {
274 "HIGH" } else {
276 "MEDIUM"
277 }
278 .to_string();
279
280 let analysis_summary = format!(
281 "Normal distribution analysis: mean={mean:.3}, std={std_dev:.3}, skew={skewness:.3}, kurt={kurtosis:.3}, p={normality_test_p:.4}, risk={risk_level}"
282 );
283
284 let normal_data = NormalData {
285 mean,
286 std_dev,
287 skewness,
288 kurtosis,
289 normality_test_p,
290 risk_level,
291 total_numbers: numbers.len(),
292 analysis_summary,
293 };
294
295 Ok(vec![LawkitResult::NormalAnalysis(
296 "normal_analysis".to_string(),
297 normal_data,
298 )])
299}
300
301fn analyze_poisson_distribution(
302 data: &Value,
303 _options: &LawkitOptions,
304) -> Result<Vec<LawkitResult>> {
305 let numbers = extract_numbers_from_value(data)?;
306
307 if numbers.is_empty() {
308 return Err(anyhow!("No valid numbers found in input data"));
309 }
310
311 let integers: Vec<u32> = numbers
313 .iter()
314 .filter_map(|&x| {
315 if x >= 0.0 && x.fract() == 0.0 {
316 Some(x as u32)
317 } else {
318 None
319 }
320 })
321 .collect();
322
323 if integers.is_empty() {
324 return Err(anyhow!(
325 "No valid non-negative integers found for Poisson analysis"
326 ));
327 }
328
329 let lambda = integers.iter().sum::<u32>() as f64 / integers.len() as f64;
331
332 let variance = integers
334 .iter()
335 .map(|&x| (x as f64 - lambda).powi(2))
336 .sum::<f64>()
337 / integers.len() as f64;
338
339 let variance_ratio = variance / lambda;
341
342 let poisson_test_p = calculate_poisson_p_value(variance_ratio, integers.len());
344
345 let risk_level = if (variance_ratio - 1.0).abs() < 0.2 && poisson_test_p > 0.05 {
347 "LOW" } else if (variance_ratio - 1.0).abs() > 0.8 || poisson_test_p < 0.01 {
349 "HIGH" } else {
351 "MEDIUM"
352 }
353 .to_string();
354
355 let analysis_summary = format!(
356 "Poisson distribution analysis: lambda={lambda:.3}, var/mean={variance_ratio:.3}, p={poisson_test_p:.4}, risk={risk_level}"
357 );
358
359 let poisson_data = PoissonData {
360 lambda,
361 variance_ratio,
362 poisson_test_p,
363 risk_level,
364 total_events: integers.len(),
365 analysis_summary,
366 };
367
368 Ok(vec![LawkitResult::PoissonAnalysis(
369 "poisson_analysis".to_string(),
370 poisson_data,
371 )])
372}
373
374fn analyze_all_laws(data: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
375 let mut results = Vec::new();
376 let mut laws_analyzed = Vec::new();
377 let mut overall_risks = Vec::new();
378
379 if let Ok(mut benford_results) = analyze_benford_law(data, _options) {
381 laws_analyzed.push("Benford".to_string());
382 if let Some(LawkitResult::BenfordAnalysis(_, ref benford_data)) = benford_results.first() {
383 overall_risks.push(benford_data.risk_level.clone());
384 }
385 results.append(&mut benford_results);
386 }
387
388 if let Ok(mut pareto_results) = analyze_pareto_principle(data, _options) {
389 laws_analyzed.push("Pareto".to_string());
390 if let Some(LawkitResult::ParetoAnalysis(_, ref pareto_data)) = pareto_results.first() {
391 overall_risks.push(pareto_data.risk_level.clone());
392 }
393 results.append(&mut pareto_results);
394 }
395
396 if let Ok(mut zipf_results) = analyze_zipf_law(data, _options) {
397 laws_analyzed.push("Zipf".to_string());
398 if let Some(LawkitResult::ZipfAnalysis(_, ref zipf_data)) = zipf_results.first() {
399 overall_risks.push(zipf_data.risk_level.clone());
400 }
401 results.append(&mut zipf_results);
402 }
403
404 if let Ok(mut normal_results) = analyze_normal_distribution(data, _options) {
405 laws_analyzed.push("Normal".to_string());
406 if let Some(LawkitResult::NormalAnalysis(_, ref normal_data)) = normal_results.first() {
407 overall_risks.push(normal_data.risk_level.clone());
408 }
409 results.append(&mut normal_results);
410 }
411
412 if let Ok(mut poisson_results) = analyze_poisson_distribution(data, _options) {
413 laws_analyzed.push("Poisson".to_string());
414 if let Some(LawkitResult::PoissonAnalysis(_, ref poisson_data)) = poisson_results.first() {
415 overall_risks.push(poisson_data.risk_level.clone());
416 }
417 results.append(&mut poisson_results);
418 }
419
420 let high_count = overall_risks.iter().filter(|&r| r == "HIGH").count();
422 let medium_count = overall_risks.iter().filter(|&r| r == "MEDIUM").count();
423
424 let overall_risk = if high_count > 0 {
425 "HIGH"
426 } else if medium_count > 0 {
427 "MEDIUM"
428 } else {
429 "LOW"
430 }
431 .to_string();
432
433 let recommendations = generate_recommendations(&laws_analyzed, &overall_risks);
435
436 let analysis_summary = format!(
437 "Integrated analysis of {} laws completed. Overall risk: {overall_risk}",
438 laws_analyzed.len()
439 );
440
441 let integration_data = IntegrationData {
442 laws_analyzed,
443 overall_risk,
444 conflicting_results: Vec::new(), recommendations,
446 analysis_summary,
447 };
448
449 results.push(LawkitResult::IntegrationAnalysis(
450 "integration_analysis".to_string(),
451 integration_data,
452 ));
453
454 Ok(results)
455}
456
457fn validate_data(data: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
458 let numbers = extract_numbers_from_value(data)?;
459
460 let mut issues_found = Vec::new();
461 let mut validation_passed = true;
462
463 if numbers.is_empty() {
465 issues_found.push("No valid numbers found in input data".to_string());
466 validation_passed = false;
467 }
468
469 if numbers.len() < 10 {
471 issues_found.push(format!(
472 "Small sample size: {} (recommended: 10+)",
473 numbers.len()
474 ));
475 validation_passed = false;
476 }
477
478 let infinite_count = numbers.iter().filter(|&&x| x.is_infinite()).count();
480 let nan_count = numbers.iter().filter(|&&x| x.is_nan()).count();
481
482 if infinite_count > 0 {
483 issues_found.push(format!("Found {infinite_count} infinite values"));
484 validation_passed = false;
485 }
486
487 if nan_count > 0 {
488 issues_found.push(format!("Found {nan_count} NaN values"));
489 validation_passed = false;
490 }
491
492 let total_issues = issues_found.len() as f64;
494 let data_quality_score = if numbers.is_empty() {
495 0.0
496 } else {
497 (1.0 - (total_issues / 10.0)).clamp(0.0, 1.0) };
499
500 let analysis_summary = if validation_passed {
501 "Data validation passed successfully".to_string()
502 } else {
503 format!("Data validation failed with {} issues", issues_found.len())
504 };
505
506 let validation_data = ValidationData {
507 validation_passed,
508 issues_found,
509 data_quality_score,
510 analysis_summary,
511 };
512
513 Ok(vec![LawkitResult::ValidationResult(
514 "validation_result".to_string(),
515 validation_data,
516 )])
517}
518
519fn diagnose_data(data: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
520 let numbers = extract_numbers_from_value(data)?;
521
522 if numbers.is_empty() {
523 return Err(anyhow!("No valid numbers found for diagnosis"));
524 }
525
526 let mut findings = Vec::new();
527
528 let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
530 let median = {
531 let mut sorted = numbers.clone();
532 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
533 sorted[sorted.len() / 2]
534 };
535
536 findings.push(format!("Sample size: {}", numbers.len()));
537 findings.push(format!("Mean: {mean:.3}"));
538 findings.push(format!("Median: {median:.3}"));
539
540 let min_val = numbers.iter().cloned().fold(f64::INFINITY, f64::min);
542 let max_val = numbers.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
543 let range = max_val - min_val;
544
545 findings.push(format!(
546 "Range: {min_val:.3} to {max_val:.3} (span: {range:.3})"
547 ));
548
549 if (mean - median).abs() < 0.1 * mean.abs() {
551 findings.push("Distribution appears symmetric".to_string());
552 } else if mean > median {
553 findings.push("Distribution appears right-skewed".to_string());
554 } else {
555 findings.push("Distribution appears left-skewed".to_string());
556 }
557
558 let q1 = calculate_percentile(&numbers, 0.25);
560 let q3 = calculate_percentile(&numbers, 0.75);
561 let iqr = q3 - q1;
562 let outlier_threshold_low = q1 - 1.5 * iqr;
563 let outlier_threshold_high = q3 + 1.5 * iqr;
564
565 let outliers: Vec<f64> = numbers
566 .iter()
567 .cloned()
568 .filter(|&x| x < outlier_threshold_low || x > outlier_threshold_high)
569 .collect();
570
571 if !outliers.is_empty() {
572 findings.push(format!("Found {} potential outliers", outliers.len()));
573 }
574
575 let confidence_level = if numbers.len() >= 100 {
576 0.95
577 } else if numbers.len() >= 30 {
578 0.80
579 } else {
580 0.60
581 };
582
583 let analysis_summary = format!(
584 "Diagnostic analysis completed with {} findings (confidence: {:.0}%)",
585 findings.len(),
586 confidence_level * 100.0
587 );
588
589 let diagnostic_data = DiagnosticData {
590 diagnostic_type: "General".to_string(),
591 findings,
592 confidence_level,
593 analysis_summary,
594 };
595
596 Ok(vec![LawkitResult::DiagnosticResult(
597 "diagnostic_result".to_string(),
598 diagnostic_data,
599 )])
600}
601
602fn generate_sample_data(config: &Value, _options: &LawkitOptions) -> Result<Vec<LawkitResult>> {
603 use crate::generate::{
604 BenfordGenerator, DataGenerator, GenerateConfig, NormalGenerator, ParetoGenerator,
605 PoissonGenerator, ZipfGenerator,
606 };
607
608 let data_type = config
610 .get("type")
611 .and_then(|v| v.as_str())
612 .unwrap_or("benford");
613
614 let count = config.get("count").and_then(|v| v.as_u64()).unwrap_or(1000) as usize;
615 let seed = config.get("seed").and_then(|v| v.as_u64());
616
617 let mut gen_config = GenerateConfig::new(count);
618 if let Some(s) = seed {
619 gen_config = gen_config.with_seed(s);
620 }
621
622 let mut parameters = HashMap::new();
623 let sample_data = match data_type {
624 "benford" | "benf" => {
625 let min_value = config.get("min").and_then(|v| v.as_f64()).unwrap_or(1.0);
626 let max_value = config
627 .get("max")
628 .and_then(|v| v.as_f64())
629 .unwrap_or(100000.0);
630 parameters.insert("min".to_string(), min_value);
631 parameters.insert("max".to_string(), max_value);
632 let generator = BenfordGenerator::new(min_value, max_value);
633 generator.generate(&gen_config)?
634 }
635 "pareto" => {
636 let alpha = config.get("alpha").and_then(|v| v.as_f64()).unwrap_or(1.16);
637 let x_m = config.get("x_m").and_then(|v| v.as_f64()).unwrap_or(1.0);
638 parameters.insert("alpha".to_string(), alpha);
639 parameters.insert("x_m".to_string(), x_m);
640 let generator = ParetoGenerator::new(alpha, x_m);
641 generator.generate(&gen_config)?
642 }
643 "zipf" => {
644 let s = config.get("s").and_then(|v| v.as_f64()).unwrap_or(1.0);
645 let n = config.get("n").and_then(|v| v.as_u64()).unwrap_or(1000) as usize;
646 parameters.insert("s".to_string(), s);
647 parameters.insert("n".to_string(), n as f64);
648 let generator = ZipfGenerator::new(s, n);
649 generator
650 .generate(&gen_config)?
651 .into_iter()
652 .map(|x| x as f64)
653 .collect()
654 }
655 "normal" => {
656 let mean = config.get("mean").and_then(|v| v.as_f64()).unwrap_or(0.0);
657 let std_dev = config
658 .get("std_dev")
659 .and_then(|v| v.as_f64())
660 .unwrap_or(1.0);
661 parameters.insert("mean".to_string(), mean);
662 parameters.insert("std_dev".to_string(), std_dev);
663 let generator = NormalGenerator::new(mean, std_dev);
664 generator.generate(&gen_config)?
665 }
666 "poisson" => {
667 let lambda = config.get("lambda").and_then(|v| v.as_f64()).unwrap_or(5.0);
668 let time_series = config
669 .get("time_series")
670 .and_then(|v| v.as_bool())
671 .unwrap_or(false);
672 parameters.insert("lambda".to_string(), lambda);
673 let generator = PoissonGenerator::new(lambda, time_series);
674 generator
675 .generate(&gen_config)?
676 .into_iter()
677 .map(|x| x as f64)
678 .collect()
679 }
680 _ => return Err(anyhow!("Unknown data type for generation: {}", data_type)),
681 };
682
683 let generated_info = GeneratedDataInfo {
684 data_type: data_type.to_string(),
685 count,
686 parameters,
687 sample_data,
688 };
689
690 Ok(vec![LawkitResult::GeneratedData(
691 "generated_data".to_string(),
692 generated_info,
693 )])
694}
695
696use helpers::*;
698
699pub use helpers::format_output;
701pub use parsers::{parse_csv, parse_ini, parse_json, parse_toml, parse_xml, parse_yaml};
702
703pub mod types;
709
710pub mod helpers;
712pub mod parsers;
713
714pub mod diffx_core_mock;
716
717pub mod common;
718pub mod core;
719pub mod error;
720pub mod generate;
721pub mod laws;