alimentar 0.2.8

Data Loading, Distribution and Tooling in Pure Rust
Documentation
//! `HuggingFace` Hub Publishing Example with Quality Validation
//!
//! This example demonstrates the CRITICAL importance of data quality
//! validation before publishing to `HuggingFace` Hub.
//!
//! # WARNING: Data Quality is NON-NEGOTIABLE
//!
//! Publishing low-quality datasets to `HuggingFace` is HARMFUL:
//! - Models learn incorrect patterns from garbage data
//! - Compute resources wasted on training with bad data
//! - Downstream models inherit quality problems
//! - Trust in the dataset ecosystem is eroded
//!
//! # Run this example:
//! ```bash
//! cargo run --example hub_publishing --features hf-hub
//! ```

/// Minimum acceptable quality score (Grade B)
const MIN_QUALITY_SCORE: f64 = 85.0;

/// Quality thresholds for grading
mod quality_grades {
    pub const GRADE_A: f64 = 95.0;
    pub const GRADE_B: f64 = 85.0;
    pub const GRADE_C: f64 = 70.0;
}

fn main() {
    println!("╔══════════════════════════════════════════════════════════════════╗");
    println!("║          HUGGINGFACE HUB PUBLISHING WITH QUALITY GATES           ║");
    println!("╠══════════════════════════════════════════════════════════════════╣");
    println!("║  WARNING: Publishing low-quality data is HARMFUL to ML community ║");
    println!("║  ALWAYS validate quality score before publishing ANY dataset     ║");
    println!("╚══════════════════════════════════════════════════════════════════╝");
    println!();

    // Example workflow - in real usage, replace with actual paths
    let parquet_path = "data.parquet";
    let repo_id = "your-org/your-dataset";

    println!("=== STEP 1: Quality Validation (MANDATORY) ===");
    println!();

    // Simulate quality score check
    let quality_score = simulate_quality_check(parquet_path);

    print_quality_report(quality_score);

    if quality_score < MIN_QUALITY_SCORE {
        println!();
        println!("╔══════════════════════════════════════════════════════════════════╗");
        println!("║                    ⛔ UPLOAD BLOCKED ⛔                          ║");
        println!("╠══════════════════════════════════════════════════════════════════╣");
        println!("║  Quality score {quality_score:.1}% is below minimum threshold of {MIN_QUALITY_SCORE}%          ║");
        println!("║                                                                  ║");
        println!("║  FIX YOUR DATA BEFORE PUBLISHING:                                ║");
        println!("║                                                                  ║");
        println!("║  Recipe 1: Clean with aprender                                   ║");
        println!("║    $ aprender clean input.parquet --output cleaned.parquet       ║");
        println!("║      --remove-nulls --deduplicate --validate-types               ║");
        println!("║                                                                  ║");
        println!("║  Recipe 2: Augment with entrenar                                 ║");
        println!("║    $ entrenar augment input.parquet --output augmented.parquet   ║");
        println!("║      --balance-classes --normalize                               ║");
        println!("║                                                                  ║");
        println!("║  Recipe 3: Manual inspection                                     ║");
        println!("║    $ alimentar head input.parquet --rows 100                     ║");
        println!("║    $ alimentar quality score input.parquet --verbose             ║");
        println!("╚══════════════════════════════════════════════════════════════════╝");

        std::process::exit(1);
    }

    println!();
    println!("=== STEP 2: Dataset Card Validation ===");
    println!();

    let readme_content = generate_readme_template(quality_score);
    validate_readme(&readme_content);

    println!();
    println!("=== STEP 3: Upload to HuggingFace Hub ===");
    println!();

    // This is where actual upload would happen
    println!("Would upload to: {repo_id}");
    println!("  - Parquet file: {parquet_path}");
    println!("  - README.md with quality score: {quality_score:.1}%");
    println!();

    print_upload_command(parquet_path, repo_id);

    println!();
    println!("=== QUALITY IMPROVEMENT RECIPES ===");
    println!();
    print_quality_recipes();
}

/// Simulates quality check - replace with actual alimentar quality API
fn simulate_quality_check(path: &str) -> f64 {
    println!("Checking quality of: {path}");
    println!();

    // In real code, use alimentar's quality API:
    // let dataset = ArrowDataset::from_parquet(path)?;
    // let score = quality::score_dataset(&dataset)?;

    // Simulated checks
    let checks = vec![
        ("Schema Validity", 100.0, "All columns have valid types"),
        ("Completeness", 95.0, "5% null values in optional columns"),
        ("Uniqueness", 92.0, "8% duplicate rows detected"),
        ("Consistency", 88.0, "Some format inconsistencies"),
        ("Value Range", 90.0, "Values within expected bounds"),
    ];

    let mut total = 0.0;
    for (name, score, detail) in &checks {
        let status = if *score >= 90.0 {
            ""
        } else if *score >= 70.0 {
            ""
        } else {
            ""
        };
        println!("  {status} {name}: {score:.0}% - {detail}");
        total += score;
    }

    #[allow(clippy::cast_precision_loss)]
    {
        total / checks.len() as f64
    }
}

/// Print quality report with grade
fn print_quality_report(score: f64) {
    println!();
    println!("┌─────────────────────────────────────┐");

    let grade = if score >= quality_grades::GRADE_A {
        ("A", "Excellent - Safe to publish")
    } else if score >= quality_grades::GRADE_B {
        ("B", "Good - Review warnings first")
    } else if score >= quality_grades::GRADE_C {
        ("C", "Poor - DO NOT PUBLISH")
    } else {
        ("D", "Failed - Major quality issues")
    };

    println!("│  Quality Score: {:.1}% (Grade {})    │", score, grade.0);
    println!("│  Status: {}", grade.1);
    println!("└─────────────────────────────────────┘");
}

/// Generate README template with quality score
fn generate_readme_template(quality_score: f64) -> String {
    format!(
        r#"---
license: apache-2.0
task_categories:
  - text-generation
language:
  - en
tags:
  - code
  - quality-validated
size_categories:
  - n<1K
---

# Dataset Name

Quality Score: {:.1}% (validated with alimentar)

## Description

Describe your dataset here.

## Quality Validation

This dataset was validated using alimentar's quality scoring system:

- **Grade**: {}
- **Score**: {:.1}%
- **Validation Date**: {}

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("your-org/your-dataset")
```

## License

Apache 2.0
"#,
        quality_score,
        if quality_score >= 95.0 {
            "A"
        } else if quality_score >= 85.0 {
            "B"
        } else {
            "C"
        },
        quality_score,
        chrono::Utc::now().format("%Y-%m-%d")
    )
}

/// Validate README against `HuggingFace` requirements
fn validate_readme(content: &str) {
    println!("Validating dataset card...");

    // Check required fields
    let checks = vec![
        ("license field", content.contains("license:")),
        ("task_categories", content.contains("task_categories:")),
        (
            "valid task category",
            content.contains("text-generation") || content.contains("translation"),
        ),
        ("size_categories", content.contains("size_categories:")),
    ];

    for (name, passed) in &checks {
        let status = if *passed { "" } else { "" };
        println!("  {status} {name}");
    }

    // In real code, use alimentar's validator:
    // DatasetCardValidator::validate_readme_strict(content)?;
}

/// Print the actual CLI command
fn print_upload_command(parquet_path: &str, repo_id: &str) {
    println!("To upload, run:");
    println!();
    println!("  # Set your HuggingFace token");
    println!("  export HF_TOKEN=\"hf_xxxxx\"");
    println!();
    println!("  # Upload with quality-validated README");
    println!("  alimentar hub push {parquet_path} {repo_id} \\");
    println!("    --readme README.md \\");
    println!("    --message \"Quality-validated upload\"");
}

/// Print quality improvement recipes
fn print_quality_recipes() {
    println!("┌──────────────────────────────────────────────────────────────────┐");
    println!("│              DATA QUALITY IMPROVEMENT RECIPES                    │");
    println!("├──────────────────────────────────────────────────────────────────┤");
    println!("│                                                                  │");
    println!("│  RECIPE 1: Clean with aprender (data cleaning)                   │");
    println!("│  ─────────────────────────────────────────────                   │");
    println!("│  $ cargo install aprender                                        │");
    println!("│  $ aprender clean input.parquet -o cleaned.parquet \\");
    println!("│      --remove-nulls \\");
    println!("│      --deduplicate \\");
    println!("│      --validate-types \\");
    println!("│      --normalize-text                                            │");
    println!("│                                                                  │");
    println!("│  RECIPE 2: Augment with entrenar (ML training prep)              │");
    println!("│  ─────────────────────────────────────────────────               │");
    println!("│  $ cargo install entrenar                                        │");
    println!("│  $ entrenar augment input.parquet -o augmented.parquet \\");
    println!("│      --balance-classes \\");
    println!("│      --synthetic-samples 1000 \\");
    println!("│      --validate-output                                           │");
    println!("│                                                                  │");
    println!("│  RECIPE 3: Full Pipeline                                         │");
    println!("│  ────────────────────────                                        │");
    println!("│  $ alimentar quality score input.parquet                         │");
    println!("│  $ aprender clean input.parquet -o /tmp/cleaned.parquet          │");
    println!("│  $ entrenar augment /tmp/cleaned.parquet -o output.parquet       │");
    println!("│  $ alimentar quality score output.parquet                        │");
    println!("│  $ alimentar hub push output.parquet org/dataset                 │");
    println!("│                                                                  │");
    println!("│  RECIPE 4: Quality Profile Validation                            │");
    println!("│  ───────────────────────────────────                             │");
    println!("│  $ alimentar quality score data.parquet \\");
    println!("│      --profile ml-training                                       │");
    println!("│  $ alimentar quality score data.parquet \\");
    println!("│      --profile doctest-corpus                                    │");
    println!("│                                                                  │");
    println!("└──────────────────────────────────────────────────────────────────┘");
}

// Stub for chrono - in real code this would use the chrono crate
mod chrono {
    pub struct Utc;
    impl Utc {
        pub fn now() -> Self {
            Self
        }
        #[allow(clippy::unused_self)]
        pub fn format(&self, _: &str) -> &'static str {
            "2025-11-29"
        }
    }
}