// dataprof 0.8.0
//
// High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files.
// Documentation
use std::io::Write;

use dataprof::{ColumnStats, DataType, EngineType, Profiler};
use tempfile::NamedTempFile;

/// Writes `content` into a freshly created temporary file with a `.csv`
/// suffix, flushes it, and returns the file handle.
///
/// Panics if the file cannot be created, written, or flushed.
fn write_csv(content: &str) -> NamedTempFile {
    let mut csv_file = NamedTempFile::with_suffix(".csv").unwrap();
    csv_file.write_all(content.as_bytes()).unwrap();
    csv_file.flush().unwrap();
    csv_file
}

/// Checks that the `negative_values_in_positive` metric follows the
/// `positive_columns` hint.
///
/// The same CSV is profiled twice with the incremental engine. Without a
/// hint the metric is expected to be 0; with `pressure` declared positive it
/// is expected to be 1 (`pressure` contains exactly one negative value).
#[test]
fn positive_columns_drive_negative_values_in_positive() {
    let csv = write_csv("pressure,temperature_delta\n101325,1\n-500,-2\n100900,3\n");
    let csv_path = csv.path();

    // Baseline run: no column is declared as positive-only.
    let baseline_report = Profiler::new()
        .engine(EngineType::Incremental)
        .analyze_file(csv_path)
        .expect("profile should succeed");
    let baseline_quality = baseline_report.quality.as_ref().unwrap();
    assert_eq!(baseline_quality.metrics.negative_values_in_positive(), 0);

    // Hinted run: only `pressure` is declared positive, so the negative
    // entries of `temperature_delta` are not expected to be counted.
    let hinted_report = Profiler::new()
        .engine(EngineType::Incremental)
        .positive_columns(vec!["pressure".to_string()])
        .analyze_file(csv_path)
        .expect("profile should succeed");
    let hinted_quality = hinted_report.quality.as_ref().unwrap();
    assert_eq!(hinted_quality.metrics.negative_values_in_positive(), 1);
}

/// Checks how the `identifier_columns` hint changes the profile of `order_id`.
///
/// Without the hint the column is expected to be typed `Integer`, carry
/// numeric stats, and yield a non-zero outlier ratio. With the hint it is
/// expected to be typed `Identifier`, carry text stats, and yield an outlier
/// ratio of exactly zero.
#[test]
fn identifier_columns_are_semantic_strings_and_excluded_from_outliers() {
    let csv = write_csv("order_id\n1\n2\n3\n10000\n");
    let csv_path = csv.path();

    // Baseline run: `order_id` is profiled with no semantic hint.
    let baseline_report = Profiler::new()
        .engine(EngineType::Incremental)
        .analyze_file(csv_path)
        .expect("profile should succeed");
    let baseline_column = baseline_report
        .column_profiles
        .iter()
        .find(|profile| profile.name == "order_id")
        .unwrap();
    assert_eq!(baseline_column.data_type, DataType::Integer);
    assert!(matches!(baseline_column.stats, ColumnStats::Numeric(_)));
    let baseline_quality = baseline_report.quality.as_ref().unwrap();
    assert!(baseline_quality.metrics.outlier_ratio() > 0.0);

    // Hinted run: `order_id` is declared to be an identifier column.
    let hinted_report = Profiler::new()
        .engine(EngineType::Incremental)
        .identifier_columns(vec!["order_id".to_string()])
        .analyze_file(csv_path)
        .expect("profile should succeed");
    let hinted_column = hinted_report
        .column_profiles
        .iter()
        .find(|profile| profile.name == "order_id")
        .unwrap();
    assert_eq!(hinted_column.data_type, DataType::Identifier);
    assert!(matches!(hinted_column.stats, ColumnStats::Text(_)));
    let hinted_quality = hinted_report.quality.as_ref().unwrap();
    assert_eq!(hinted_quality.metrics.outlier_ratio(), 0.0);
}