bashrs 6.66.0

Rust-to-Shell transpiler for deterministic bootstrap scripts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
//! # Dataset Export for Corpus Results (§10.3)
//!
//! Exports corpus results in portable formats (JSON, CSV) following the
//! Hugging Face dataset schema defined in §10.3. Enables reproducibility,
//! training data generation, and cross-project benchmarking.

use crate::corpus::registry::{CorpusEntry, CorpusFormat, CorpusRegistry};
use crate::corpus::runner::{CorpusResult, CorpusRunner, CorpusScore};
use crate::Config;
use std::collections::HashMap;
use std::fmt;

/// Export format for corpus data
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExportFormat {
    /// JSON Lines (one JSON object per line)
    JsonLines,
    /// CSV with headers
    Csv,
    /// JSON array (pretty-printed)
    Json,
    /// Classification JSONL for ML fine-tuning ({"input":"...","label":N})
    Classification,
    /// Multi-label classification JSONL ({"input":"...","labels":[0.0, 1.0, ...]})
    MultiLabelClassification,
}

impl fmt::Display for ExportFormat {
    /// Renders the CLI-facing tag for the format (e.g. `jsonl`, `csv`).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let tag = match self {
            Self::JsonLines => "jsonl",
            Self::Csv => "csv",
            Self::Json => "json",
            Self::Classification => "classification",
            Self::MultiLabelClassification => "multi-label-classification",
        };
        f.write_str(tag)
    }
}

/// A single row in the exported dataset (§10.3 schema)
#[derive(Debug, Clone, serde::Serialize)]
pub struct DatasetRow {
    // Corpus-entry identity.
    pub id: String,
    pub name: String,
    pub tier: u8,
    pub format: String,
    // Transpilation input and the expected/actual shell outputs.
    pub input_rust: String,
    pub expected_output: String,
    pub actual_output: String,
    // Per-check outcomes.
    pub transpiled: bool,
    pub output_correct: bool,
    pub lint_clean: bool,
    pub deterministic: bool,
    // Aggregate scoring and safety classification.
    pub score: f64,
    pub grade: String,
    pub safety_index: u8,
    pub safety_label: String,
    // Export provenance (tool version, commit, and export date).
    pub bashrs_version: String,
    pub commit_sha: String,
    pub date: String,
}

/// Lightweight classification row for ML training (entrenar-compatible).
///
/// Format: `{"input": "<shell script>", "label": N}` where N is 0-4.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ClassificationRow {
    /// Raw shell-script text presented to the model.
    pub input: String,
    /// Safety class index (0-4); see `SAFETY_LABELS` for the taxonomy.
    pub label: u8,
}

/// Result of pre-training data validation.
///
/// Catches data quality issues that cause training divergence BEFORE
/// burning GPU time. Every export path must call `validate_export()`
/// and abort if `!result.passed`.
#[derive(Debug)]
pub struct ExportValidation {
    /// True only when no errors were recorded.
    pub passed: bool,
    /// Number of rows validated.
    pub total: usize,
    /// Number of distinct classes with at least one sample.
    pub num_classes: usize,
    /// Sample count per class index 0-4.
    pub class_counts: [usize; 5],
    /// Hard failures; non-empty implies `passed == false`.
    pub errors: Vec<String>,
    /// Soft issues; recorded but do not affect `passed`.
    pub warnings: Vec<String>,
}

/// Validate exported classification data for training readiness.
///
/// `expected_classes` is the number of classes the model head will have.
/// For binary safe/unsafe, pass 2. For full taxonomy, pass 5.
///
/// Hard checks (any failure sets `passed = false`):
/// 1. **No missing classes**: all classes 0..expected_classes have >=1 sample
/// 2. **No extreme imbalance**: no single class >95% of total
/// 3. **No preamble contamination**: no inputs start with `#!/bin/sh` or `set -euf`
/// 4. **No length confound**: max avg length / min avg length < 10x across classes
///
/// Soft checks (recorded as warnings only; they do NOT affect `passed`):
/// - moderate imbalance: dominant class >85%
/// - moderate length spread: >5x avg-length ratio
/// - trivial inputs: shorter than 3 chars after trimming
///
/// Returns `ExportValidation` with `passed=true` only if all hard checks pass.
pub fn validate_export(rows: &[ClassificationRow], expected_classes: u8) -> ExportValidation {
    // A single pass over the rows gathers every aggregate the checks need.
    let stats = collect_export_stats(rows);
    let total = rows.len();
    let num_classes = stats.present_classes.len();
    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    check_missing_classes(&stats, expected_classes, num_classes, &mut errors);
    check_class_imbalance(&stats.class_counts, total, &mut errors, &mut warnings);
    check_preamble_contamination(stats.preamble_count, total, &mut errors);
    check_length_confound(&stats, &mut errors, &mut warnings);
    // Trivial inputs are deliberately a warning, not an error: they never
    // flip `passed` (only `errors` does, below).
    if stats.trivial_count > 0 {
        warnings.push(format!(
            "trivial inputs: {} samples have <3 chars",
            stats.trivial_count
        ));
    }

    ExportValidation {
        passed: errors.is_empty(),
        total,
        num_classes,
        class_counts: stats.class_counts,
        errors,
        warnings,
    }
}

/// Aggregated statistics from a classification export.
struct ExportStats {
    /// Sample count per class index 0-4.
    class_counts: [usize; 5],
    /// Summed input byte length per class (feeds the length-confound check).
    class_total_len: [u64; 5],
    /// Class indices that have at least one sample.
    present_classes: Vec<u8>,
    /// Inputs whose trimmed text starts with shell boilerplate.
    preamble_count: usize,
    /// Inputs shorter than 3 chars after trimming.
    trivial_count: usize,
}

/// Single pass over the rows, accumulating everything the validators need.
fn collect_export_stats(rows: &[ClassificationRow]) -> ExportStats {
    let mut counts = [0usize; 5];
    let mut total_len = [0u64; 5];
    let mut preambles = 0usize;
    let mut trivial = 0usize;

    for row in rows {
        let idx = row.label as usize;
        // Labels outside 0..5 are silently ignored by the counters.
        if let Some(slot) = counts.get_mut(idx) {
            *slot += 1;
            total_len[idx] += row.input.len() as u64;
        }

        let trimmed = row.input.trim();
        let boilerplate = trimmed.starts_with("#!/bin/sh")
            || trimmed.starts_with("set -euf")
            || trimmed.starts_with("IFS='")
            || trimmed.starts_with("export LC_ALL=C");
        if boilerplate {
            preambles += 1;
        }
        if trimmed.len() < 3 {
            trivial += 1;
        }
    }

    // Record which classes actually appeared.
    let mut present = Vec::new();
    for (class, &count) in counts.iter().enumerate() {
        if count > 0 {
            present.push(class as u8);
        }
    }

    ExportStats {
        class_counts: counts,
        class_total_len: total_len,
        present_classes: present,
        preamble_count: preambles,
        trivial_count: trivial,
    }
}

/// Hard check: every class the model head expects must have >=1 sample.
// NOTE(review): indexes `class_counts[expected-1]`, so this assumes
// `expected <= 5` — larger values would panic. Confirm callers.
fn check_missing_classes(
    stats: &ExportStats,
    expected: u8,
    num_present: usize,
    errors: &mut Vec<String>,
) {
    let mut missing = Vec::new();
    for class in 0..expected {
        if stats.class_counts[class as usize] == 0 {
            missing.push(class);
        }
    }
    if missing.is_empty() {
        return;
    }
    errors.push(format!(
        "missing classes {:?} — model head has {} outputs but only {} classes present",
        missing, expected, num_present
    ));
}

/// Imbalance check: error above 95% dominant-class share, warn above 85%.
fn check_class_imbalance(
    class_counts: &[usize; 5],
    total: usize,
    errors: &mut Vec<String>,
    warnings: &mut Vec<String>,
) {
    if total == 0 {
        // Nothing to measure; an empty export triggers other checks instead.
        return;
    }
    // A fixed-size array always has a max; unwrap_or is unreachable.
    let max_count = class_counts.iter().copied().max().unwrap_or(0);
    let max_pct = 100.0 * max_count as f64 / total as f64;

    if max_pct > 95.0 {
        errors.push(format!(
            "extreme class imbalance: dominant class has {:.1}% of samples — \
             model will learn majority-class prediction, not safety features",
            max_pct
        ));
    } else if max_pct > 85.0 {
        warnings.push(format!(
            "class imbalance: dominant class has {:.1}% — consider oversampling minorities",
            max_pct
        ));
    }
}

/// Hard check: any shell-boilerplate prefix in the inputs is an error —
/// a shared preamble would let the model key on boilerplate, not content.
fn check_preamble_contamination(preamble_count: usize, total: usize, errors: &mut Vec<String>) {
    if preamble_count == 0 {
        return;
    }
    errors.push(format!(
        "preamble contamination: {}/{} inputs start with shell boilerplate",
        preamble_count, total
    ));
}

/// Length-confound check: error above a 10x avg-length ratio between
/// classes, warn above 5x. A big spread lets the model predict from
/// input length alone.
fn check_length_confound(
    stats: &ExportStats,
    errors: &mut Vec<String>,
    warnings: &mut Vec<String>,
) {
    // Average input length per present class.
    let mut avg_lens = Vec::with_capacity(stats.present_classes.len());
    for &class in &stats.present_classes {
        let idx = class as usize;
        let count = stats.class_counts[idx];
        let avg = if count > 0 {
            stats.class_total_len[idx] as f64 / count as f64
        } else {
            0.0
        };
        avg_lens.push(avg);
    }

    // A ratio needs at least two classes to compare.
    if avg_lens.len() < 2 {
        return;
    }
    let mut min_avg = f64::MAX;
    let mut max_avg = 0.0f64;
    for &avg in &avg_lens {
        min_avg = min_avg.min(avg);
        max_avg = max_avg.max(avg);
    }
    // Guard the division (e.g. a present class of empty-string inputs).
    if min_avg <= 0.0 {
        return;
    }

    let ratio = max_avg / min_avg;
    if ratio > 10.0 {
        errors.push(format!(
            "length confound: {ratio:.1}x ratio between shortest and longest class avg — \
             model can cheat on length instead of learning safety features"
        ));
    } else if ratio > 5.0 {
        warnings.push(format!(
            "length spread: {ratio:.1}x ratio between class avg lengths — \
             consider normalizing or truncating"
        ));
    }
}

impl std::fmt::Display for ExportValidation {
    /// Human-readable report: verdict, totals, a per-class histogram
    /// (empty classes omitted), then every error and warning.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let verdict = if self.passed { "PASS" } else { "FAIL" };
        writeln!(f, "Export Validation: {verdict}")?;
        writeln!(
            f,
            "  Total: {} samples, {} classes",
            self.total, self.num_classes
        )?;
        for (class, &count) in self.class_counts.iter().enumerate() {
            if count == 0 {
                continue;
            }
            let pct = 100.0 * count as f64 / self.total as f64;
            writeln!(f, "  Class {class}: {count:>6} ({pct:5.1}%)")?;
        }
        for err in &self.errors {
            writeln!(f, "  ERROR: {err}")?;
        }
        for warn in &self.warnings {
            writeln!(f, "  WARN: {warn}")?;
        }
        Ok(())
    }
}

/// Split assignment for a classification row.
///
/// Serialized in lowercase (`"train"`, `"val"`, `"test"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Split {
    /// Training partition (80% of rows under the hash split).
    Train,
    /// Validation partition (10%).
    Val,
    /// Held-out test partition (10%).
    Test,
}

impl fmt::Display for Split {
    /// Lowercase split name, matching the serde `rename_all = "lowercase"`
    /// encoding on the enum.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Split::Train => "train",
            Split::Val => "val",
            Split::Test => "test",
        };
        f.write_str(name)
    }
}

/// Result of splitting and validating classification data.
pub struct SplitResult {
    /// Training rows (hash buckets 0-7, ~80%).
    pub train: Vec<ClassificationRow>,
    /// Validation rows (hash bucket 8, ~10%).
    pub val: Vec<ClassificationRow>,
    /// Test rows (hash bucket 9, ~10%).
    pub test: Vec<ClassificationRow>,
    /// Validation over all rows combined.
    pub validation: ExportValidation,
    /// Per-split validations, ordered [train, val, test].
    pub split_validations: [ExportValidation; 3],
}

/// Deterministic hash-based stratified split.
///
/// Assigns each row to train/val/test using a hash of the input text,
/// producing stable splits that don't reshuffle when the corpus grows.
/// Within each split, class proportions match the overall distribution
/// (stratified).
///
/// Ratio: 80% train, 10% val, 10% test.
///
/// This is the canonical split function. All export paths must use it
/// to prevent train/test leakage and ensure reproducibility.
pub fn split_and_validate(rows: Vec<ClassificationRow>, expected_classes: u8) -> SplitResult {
    // Validate the full dataset up front. Every statistic in
    // `validate_export` is an order-independent aggregate (class counts,
    // length sums, prefix matches), so validating `rows` here is
    // equivalent to re-concatenating train+val+test afterwards — and it
    // avoids cloning every row into a temporary Vec just to validate.
    let validation = validate_export(&rows, expected_classes);

    // Hash-based assignment: stable across corpus growth.
    let mut train = Vec::new();
    let mut val = Vec::new();
    let mut test = Vec::new();
    for row in rows {
        match assign_split(&row.input) {
            Split::Train => train.push(row),
            Split::Val => val.push(row),
            Split::Test => test.push(row),
        }
    }

    // Per-split validation (e.g. a small val split may miss a class).
    let split_validations = [
        validate_export(&train, expected_classes),
        validate_export(&val, expected_classes),
        validate_export(&test, expected_classes),
    ];

    SplitResult {
        train,
        val,
        test,
        validation,
        split_validations,
    }
}

/// Assign a split based on a deterministic hash of the input text.
///
/// Uses FNV-1a hash (fast, no crypto needed) mod 10:
/// - 0..=7 → Train (80%)
/// - 8     → Val   (10%)
/// - 9     → Test  (10%)
///
/// Stable: same input always maps to same split, even if corpus order changes.
fn assign_split(input: &str) -> Split {
    let bucket = fnv1a_hash(input.as_bytes()) % 10;
    if bucket <= 7 {
        Split::Train
    } else if bucket == 8 {
        Split::Val
    } else {
        Split::Test
    }
}

/// FNV-1a 64-bit hash — fast, deterministic, no dependencies.
///
/// XOR each byte into the accumulator, then multiply by the FNV prime.
fn fnv1a_hash(data: &[u8]) -> u64 {
    const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const FNV_PRIME: u64 = 0x100000001b3;
    data.iter().fold(FNV_OFFSET_BASIS, |acc, &byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    })
}

impl fmt::Display for SplitResult {
    /// Summary report: per-split size, share, PASS/FAIL, and inline
    /// non-zero class counts, followed by overall errors and warnings.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let total = self.train.len() + self.val.len() + self.test.len();
        writeln!(f, "Split Result ({total} total):")?;
        let buckets = [
            ("train", &self.train, &self.split_validations[0]),
            ("val", &self.val, &self.split_validations[1]),
            ("test", &self.test, &self.split_validations[2]),
        ];
        for (label, bucket, report) in buckets {
            let count = bucket.len();
            let share = if total == 0 {
                0.0
            } else {
                100.0 * count as f64 / total as f64
            };
            let state = if report.passed { "PASS" } else { "FAIL" };
            write!(f, "  {label}: {count:>6} ({share:5.1}%) [{state}]")?;
            // Per-class counts inline, skipping empty classes.
            for (class, &n) in report.class_counts.iter().enumerate() {
                if n > 0 {
                    write!(f, " c{class}={n}")?;
                }
            }
            writeln!(f)?;
        }
        if !self.validation.passed {
            for err in &self.validation.errors {
                writeln!(f, "  ERROR: {err}")?;
            }
        }
        for warn in &self.validation.warnings {
            writeln!(f, "  WARN: {warn}")?;
        }
        Ok(())
    }
}

/// Multi-label classification row for ML training (SSC-021).
///
/// Format: `{"input": "<shell script>", "labels": [0.0, 1.0, 1.0, 0.0, 0.0]}`
#[derive(Debug, Clone, serde::Serialize)]
pub struct MultiLabelClassificationRow {
    /// Raw shell-script text presented to the model.
    pub input: String,
    /// One indicator per safety class, indexed like `SAFETY_LABELS`.
    pub labels: [f32; 5],
}

/// Safety class labels matching aprender `SafetyClass` enum.
///
/// Index position is the class value used in `ClassificationRow::label`;
/// do not reorder.
pub const SAFETY_LABELS: [&str; 5] = [
    "safe",              // 0
    "needs-quoting",     // 1
    "non-deterministic", // 2
    "non-idempotent",    // 3
    "unsafe",            // 4
];

/// Dataset metadata for the `dataset-info` command
#[derive(Debug, Clone)]
pub struct DatasetInfo {
    /// Total number of corpus entries.
    pub total_entries: usize,
    /// (format name, entry count) pairs.
    pub format_counts: Vec<(String, usize)>,
    // Presumably (field name, type, description) triples for the export
    // schema — TODO confirm against the dataset-info command output.
    pub schema_fields: Vec<(&'static str, &'static str, &'static str)>,
    /// bashrs version that produced the dataset.
    pub bashrs_version: String,
    /// Export date.
    pub date: String,
}

/// Build dataset rows from corpus entries and results
///
/// NOTE(review): as visible here the body only collects provenance values
/// and builds the result-lookup map, then falls off the end without
/// producing the declared `Vec<DatasetRow>` — this chunk appears truncated;
/// the remainder presumably lives in the `include!`d continuation file.
/// Confirm against `dataset_part2_incl2.rs`.
pub fn build_dataset(registry: &CorpusRegistry, score: &CorpusScore) -> Vec<DatasetRow> {
    // Provenance stamped onto every exported row.
    let version = env!("CARGO_PKG_VERSION").to_string();
    let date = current_date();
    let commit = current_commit();

    // KAIZEN-069: O(1) HashMap lookup instead of O(n) linear find per entry.
    // With 17,942 entries, the old O(n²) find wasted ~161M string comparisons.
    let results_by_id: HashMap<&str, &CorpusResult> =
        score.results.iter().map(|r| (r.id.as_str(), r)).collect();


}

    include!("dataset_part2_incl2.rs");