//! Integration tests for lk-inside 0.3.1 — a terminal user interface (TUI)
//! application for interactive data analysis.
//!
//! See the crate documentation for details.
use lk_inside::data_core::loader;
use lk_inside::data_core::analyzer::DataFrameAnalyzer;
use std::path::PathBuf;
use anyhow::Result;
use polars::prelude::{col, lit, *}; // Use wildcard import for all prelude items, and add col, lit

#[tokio::test]
async fn test_csv_data_loading_and_validation() -> Result<()> {
    // Load the sample CSV fixture and confirm it passes structural validation.
    let file_path = PathBuf::from("examples/large_sample.csv");
    let format = loader::detect_file_format(&file_path);
    // `file_path` is not used afterwards, so move it instead of cloning.
    let df = loader::load_data(file_path, format).await?;

    loader::validate_dataframe(&df)?;

    // large_sample.csv: 20 data rows, 5 columns (id, name, value, category, timestamp).
    assert_eq!(df.height(), 20);
    assert_eq!(df.width(), 5);
    Ok(())
}

#[tokio::test]
async fn test_json_data_loading_and_validation() -> Result<()> {
    // Load the sample JSON fixture and confirm it passes structural validation.
    let file_path = PathBuf::from("examples/sample_data.json");
    let format = loader::detect_file_format(&file_path);
    // `file_path` is not used afterwards, so move it instead of cloning.
    let df = loader::load_data(file_path, format).await?;

    loader::validate_dataframe(&df)?;

    // sample_data.json: 3 records with 3 fields each.
    assert_eq!(df.height(), 3);
    assert_eq!(df.width(), 3);
    Ok(())
}

#[tokio::test]
async fn test_data_ranking() -> Result<()> {
    // Rank the rows of large_sample.csv by the "value" column and verify
    // that the result is a descending sort with no rows/columns lost.
    let file_path = PathBuf::from("examples/large_sample.csv");
    let format = loader::detect_file_format(&file_path);
    // `file_path` is not used afterwards, so move it instead of cloning.
    let df = loader::load_data(file_path, format).await?;

    loader::validate_dataframe(&df)?;

    let analyzer = DataFrameAnalyzer::new(df);
    let ranked_df = analyzer.rank_by_column("value")?;

    // Ranking must reorder rows only — never add or drop data.
    assert_eq!(ranked_df.height(), 20);
    assert_eq!(ranked_df.width(), 5);

    // Collect the non-null 'value' entries; `.flatten()` is the idiomatic
    // form of `.filter_map(|x| x)` for an iterator of Options.
    let values: Vec<f64> = ranked_df.column("value")?
        .f64()?
        .into_iter()
        .flatten()
        .collect();

    // Expected 'value' column of large_sample.csv, sorted descending.
    assert_eq!(values, vec![310.10, 300.10, 260.00, 250.00, 210.75, 200.75, 185.60, 175.60, 160.20, 150.20, 130.90, 120.90, 110.50, 100.50, 100.25, 90.30, 90.25, 80.30, 60.00, 50.00]);

    Ok(())
}

#[tokio::test]
async fn test_descriptive_statistics() -> Result<()> {
    // Compute descriptive statistics over sample_data.csv and spot-check
    // the per-column means.
    let file_path = PathBuf::from("examples/sample_data.csv");
    let format = loader::detect_file_format(&file_path);
    // `file_path` is not used afterwards, so move it instead of cloning.
    let df = loader::load_data(file_path, format).await?;

    loader::validate_dataframe(&df)?;

    let analyzer = DataFrameAnalyzer::new(df);
    let stats_df = analyzer.get_descriptive_statistics()?;

    // sample_data.csv contents:
    //   id:    1, 2, 3           -> mean = 2
    //   name:  "A", "B", "C"     -> string column, contributes no numeric stats
    //   price: 10.0, 20.0, 15.0  -> mean = 15
    //
    // One row per statistic (7 measures), and at least the columns
    // Measure, id, price.
    assert_eq!(stats_df.height(), 7);
    assert!(stats_df.width() >= 3, "Expected at least 3 columns (Measure, id, price), got {}", stats_df.width());

    // Select the row holding the per-column means ("Measure" is the
    // statistic-label column produced by get_descriptive_statistics).
    let mean_row_df = stats_df
        .lazy()
        .filter(col("Measure").eq(lit("mean")))
        .collect()?;

    let id_mean_val = mean_row_df.column("id")?.f64()?.get(0);
    assert_eq!(id_mean_val, Some(2.0));

    let price_mean_val = mean_row_df.column("price")?.f64()?.get(0);
    assert_eq!(price_mean_val, Some(15.0));

    Ok(())
}

#[test]
fn test_data_grouping_and_aggregation() -> Result<()> {
    // Grouping needs no file I/O, so this test is synchronous and builds
    // its input frame in memory. `Series` comes from the prelude wildcard
    // import at the top of the file.
    let df = DataFrame::new(vec![
        Series::new("category".into(), &["A", "B", "A", "C", "B"]).into(),
        Series::new("value".into(), &[10, 20, 15, 5, 25]).into(),
    ])?;

    let analyzer = DataFrameAnalyzer::new(df);

    // Sum 'value' within each 'category'.
    let grouped_df = analyzer.group_and_aggregate(
        "category",
        &[("value", "sum")],
    )?;

    // Expected result (group output order is not guaranteed, so the
    // assertions below compare sorted columns):
    // +----------+-------+
    // | category | value |
    // +----------+-------+
    // | A        | 25    |
    // | B        | 45    |
    // | C        | 5     |
    // +----------+-------+
    assert_eq!(grouped_df.height(), 3);
    // Propagate column-access errors with `?` rather than unwrapping in a
    // Result-returning test.
    assert_eq!(
        grouped_df.column("category")?.str()?.sort(false).into_series(),
        Series::new("category".into(), &["A", "B", "C"])
    );
    assert_eq!(
        grouped_df.column("value")?.i32()?.sort(false).into_series(),
        Series::new("value".into(), &[5, 25, 45])
    );

    Ok(())
}

#[tokio::test]
async fn test_data_filtering() -> Result<()> {
    use lk_inside::analysis::filtering;

    // Filter sample_data.csv down to the rows where price exceeds 10.0.
    let file_path = PathBuf::from("examples/sample_data.csv");
    let format = loader::detect_file_format(&file_path);
    // `file_path` is not used afterwards, so move it instead of cloning.
    let df = loader::load_data(file_path, format).await?;

    let filtered_df = filtering::apply_filter(&df, "price > 10.0")?;

    // Rows with price 20.0 (id 2) and 15.0 (id 3) survive; the price-10.0
    // row is excluded because the comparison is strict.
    assert_eq!(filtered_df.height(), 2);
    // Bind each column once instead of re-looking it up per assertion.
    let ids = filtered_df.column("id")?.i64()?;
    assert_eq!(ids.get(0), Some(2));
    assert_eq!(ids.get(1), Some(3));
    let prices = filtered_df.column("price")?.f64()?;
    assert_eq!(prices.get(0), Some(20.0));
    assert_eq!(prices.get(1), Some(15.0));

    Ok(())
}