// dataprof 0.7.1
//
// High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files
// Documentation
//! Shared core logic for all CLI commands
//!
//! This module contains the common analysis logic that all subcommands inherit.
//! Benefits:
//! - Single source of truth for profiler configuration
//! - All commands automatically get improvements (progress, config, robust parsing)
//! - No code duplication across command implementations

use anyhow::Result;
use std::path::Path;

use std::sync::Arc;

use dataprof::{
    ChunkSize, EngineType, MetricPack, Profiler, ProgressEvent, ProgressSink,
    core::{DataprofConfig, sampling::SamplingStrategy},
    types::ProfileReport,
};

/// Common options that all analysis commands should support.
///
/// `Default` yields the plain configuration: no progress output, adaptive
/// chunking, auto-discovered config, whole-file analysis, all metric packs,
/// and no locale hint.
#[derive(Default)]
pub struct AnalysisOptions {
    /// Show real-time progress bars
    pub progress: bool,
    /// Custom chunk size for streaming (`None` = adaptive chunk sizing)
    pub chunk_size: Option<usize>,
    /// Config file path (`None` = auto-discover, falling back to defaults)
    pub config: Option<std::path::PathBuf>,
    /// Sample size for large files (`None` = analyze the whole file)
    pub sample: Option<usize>,
    /// Verbosity level (0=quiet, 1=normal, 2=verbose, 3=debug);
    /// when set, overrides the verbosity from the config file
    pub verbosity: Option<u8>,
    /// Metric packs to compute (None = all)
    pub metric_packs: Option<Vec<MetricPack>>,
    /// ISO 3166-1 alpha-2 locale for pattern detection (e.g. "IT", "US")
    pub locale: Option<String>,
}

/// Builder for creating a properly configured profiler with all improvements.
///
/// Holds the CLI-level [`AnalysisOptions`] plus the resolved [`DataprofConfig`].
/// The config is not yet consulted when building the profiler (hence the
/// `dead_code` allowance) but is retained so future enhancements can use it.
pub struct ProfilerBuilder {
    // CLI options driving chunking, sampling, metric packs, locale, progress.
    options: AnalysisOptions,
    // Resolved configuration; currently unused by `build_streaming`.
    #[allow(dead_code)]
    config: DataprofConfig,
}

impl ProfilerBuilder {
    /// Construct a builder from CLI options and resolved configuration.
    pub fn new(options: AnalysisOptions, config: DataprofConfig) -> Self {
        Self { options, config }
    }

    /// Assemble a fully configured profiler with all enhancements.
    ///
    /// The Incremental engine is selected so that chunk sizing, sampling, and
    /// progress callbacks are all honored by the underlying implementation.
    pub fn build_streaming(&self, _file_path: &Path) -> Result<Profiler> {
        // Chunking: a fixed size from the CLI wins; otherwise adapt to input.
        let chunking = self
            .options
            .chunk_size
            .map_or(ChunkSize::Adaptive, ChunkSize::Fixed);

        let mut profiler = Profiler::new()
            .engine(EngineType::Incremental)
            .chunk_size(chunking);

        // Optional random sampling for very large files.
        if let Some(size) = self.options.sample {
            profiler = profiler.sampling(SamplingStrategy::Random { size });
        }

        // Restrict computation to the requested metric packs, if any.
        if let Some(packs) = self.options.metric_packs.as_ref() {
            profiler = profiler.metric_packs(packs.clone());
        }

        // Locale hint for pattern detection (ISO 3166-1 alpha-2 code).
        if let Some(locale) = self.options.locale.as_ref() {
            profiler = profiler.locale(locale);
        }

        // Inline progress reporting: rewrite a single stdout line per chunk.
        if self.options.progress {
            let callback = Arc::new(|event: ProgressEvent| match event {
                ProgressEvent::ChunkProcessed {
                    rows_processed,
                    percentage,
                    processing_speed,
                    ..
                } => {
                    print!(
                        "\rProcessing: {:.1}% ({} rows, {:.1} rows/sec)",
                        percentage.unwrap_or(0.0),
                        rows_processed,
                        processing_speed
                    );
                    // `print!` does not flush; force the partial line out.
                    let _ = std::io::Write::flush(&mut std::io::stdout());
                }
                _ => {}
            });
            profiler = profiler.progress_sink(ProgressSink::Callback(callback));
        }

        Ok(profiler)
    }
}

/// High-level function to analyze a file with all improvements
///
/// This function:
/// - Detects file format (CSV, JSON, JSONL, Parquet)
/// - Loads config file if specified
/// - Configures profiler with progress, chunk size, etc.
/// - Uses robust parsing with fallback
/// - Returns quality report with ISO metrics
///
/// # Errors
/// Returns an error only when every applicable parser fails for the input.
pub fn analyze_file_with_options(
    file_path: &Path,
    options: AnalysisOptions,
) -> Result<ProfileReport> {
    // Load config: explicit CLI path > auto-discovery > defaults.
    let mut config = if let Some(config_path) = &options.config {
        // Explicit config file path provided via CLI
        match DataprofConfig::load_from_file(config_path) {
            Ok(cfg) => {
                log::info!("Loaded configuration from: {}", config_path.display());
                cfg
            }
            Err(e) => {
                log::warn!(
                    "Failed to load config from {}: {}. Using defaults.",
                    config_path.display(),
                    e
                );
                DataprofConfig::default()
            }
        }
    } else {
        // No explicit config, try auto-discovery
        DataprofConfig::load_with_discovery()
    };

    // CLI verbosity takes precedence over the config file.
    if let Some(verbosity) = options.verbosity {
        config.output.verbosity = verbosity;
    }

    // Detect file format and route to the appropriate parser.
    if super::commands::is_json_file(file_path) {
        // JSON/JSONL files: use specialized JSON parser
        Ok(dataprof::parsers::json::analyze_json_file(
            file_path,
            &dataprof::parsers::json::JsonParserConfig::default(),
        )?)
    } else if super::commands::is_parquet_file(file_path) {
        // Parquet files: use Parquet parser
        Ok(dataprof::analyze_parquet_with_quality(file_path)?)
    } else {
        // CSV files: try streaming profiler, fallback to robust parser.
        // Read verbosity before `config` moves into the builder — this avoids
        // the redundant `config.clone()` the previous version needed.
        let verbosity = config.output.verbosity;
        let builder = ProfilerBuilder::new(options, config);

        match builder.build_streaming(file_path) {
            Ok(profiler) => match profiler.analyze_file(file_path) {
                Ok(report) => {
                    // Terminate the in-place progress line, if one was drawn.
                    if builder.options.progress {
                        println!();
                    }
                    Ok(report)
                }
                Err(e) => {
                    // Recoverable: only surface the failure at verbose level.
                    if verbosity >= 2 {
                        eprintln!("Streaming analysis failed: {}. Trying robust parser...", e);
                    }
                    robust_csv_fallback(file_path)
                }
            },
            Err(e) => {
                // Build failed; recoverable, so only warn at verbose level.
                if verbosity >= 2 {
                    eprintln!(
                        "Profiler initialization failed: {}. Trying robust parser...",
                        e
                    );
                }
                robust_csv_fallback(file_path)
            }
        }
    }
}

/// Robust CSV parsing path (flexible mode defaults) used whenever the
/// streaming profiler cannot be built or fails mid-analysis.
fn robust_csv_fallback(file_path: &Path) -> Result<ProfileReport> {
    Ok(dataprof::parsers::csv::analyze_csv_file(
        file_path,
        &dataprof::parsers::csv::CsvParserConfig::default(),
    )?)
}