bashrs 6.66.0

Rust-to-Shell transpiler for deterministic bootstrap scripts
/// Return `true` if `line` contains a `$` expansion that sits outside quotes.
///
/// The line is scanned byte by byte while tracking single- and double-quote
/// state. A `$` is reported only when it is outside both kinds of quotes and
/// is immediately followed by an ASCII letter, an ASCII digit, `_`, or `{`
/// (so `$name`, `$1`, `$_x` and `${name}` match; `$(cmd)`, `$$` and a
/// trailing `$` do not).
///
/// A backslash skips the byte that follows it, except inside single quotes:
/// there the shell treats a backslash literally, so it must not hide the
/// closing `'`.
fn line_has_unquoted_var(line: &str) -> bool {
    let bytes = line.as_bytes();
    let mut in_double_quotes = false;
    let mut in_single_quotes = false;
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            // A quote character only toggles its own state when it is not
            // itself enclosed by the other kind of quote.
            b'\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
            b'"' if !in_single_quotes => in_double_quotes = !in_double_quotes,

            // Escape: skip the backslash and the escaped byte together.
            // Not applied inside single quotes, where `\` is an ordinary
            // character and the next `'` still terminates the string.
            b'\\' if !in_single_quotes && i + 1 < bytes.len() => {
                i += 2;
                continue;
            }

            // `$` outside any quotes: it is a variable reference when the
            // next byte can start a parameter name or a `${...}` form.
            b'$' if !in_single_quotes && !in_double_quotes => {
                if let Some(&next) = bytes.get(i + 1) {
                    if next.is_ascii_alphanumeric() || next == b'_' || next == b'{' {
                        return true;
                    }
                }
            }

            _ => {}
        }

        i += 1;
    }
    false
}

/// Map a numeric score to a letter grade (`"A+"` … `"F"`).
///
/// The score is converted with `as u32` before bucketing, which means:
/// - the fractional part is discarded (`96.9` is graded as `96`),
/// - negative values and NaN become `0` and therefore grade as `"F"`,
/// - values too large for `u32` saturate at `u32::MAX`.
///
/// Any score of 97 or above, including values over 100, grades as `"A+"`
/// instead of falling through to the `"F"` catch-all.
fn score_to_grade(score: f64) -> String {
    let grade = match score as u32 {
        // Open-ended top bucket so over-range scores are not graded "F".
        97..=u32::MAX => "A+",
        93..=96 => "A",
        90..=92 => "A-",
        87..=89 => "B+",
        83..=86 => "B",
        80..=82 => "B-",
        77..=79 => "C+",
        73..=76 => "C",
        70..=72 => "C-",
        60..=69 => "D",
        _ => "F",
    };
    grade.to_string()
}

/// Serialize dataset rows as JSON Lines.
///
/// Every row is turned into one JSON document with `serde_json::to_string`;
/// rows that fail to serialize are silently skipped. The documents are
/// joined with `\n` and no trailing newline is appended.
pub fn export_jsonl(rows: &[DatasetRow]) -> String {
    let mut lines: Vec<String> = Vec::new();
    for row in rows {
        if let Ok(json) = serde_json::to_string(row) {
            lines.push(json);
        }
    }
    lines.join("\n")
}

/// Export binary classification JSONL for entrenar fine-tuning.
///
/// Output format: `{"input":"<script>","label":N}` per line (the serialized
/// form of the row returned by `classify_single`).
///
/// Every row is run through `classify_single`, which takes the row's
/// `input_rust` text as the model input (with shell preamble stripped) and
/// derives the label from the `transpiled`, `lint_clean` and `deterministic`
/// flags. Rows are NOT filtered on `transpiled`: a failed transpilation is
/// simply one of the reasons a row is labelled unsafe.
///
/// Shell preamble (shebang, `set -euf`, `IFS=`, `export`, `trap`, `main "$@"`)
/// is stripped to remove noise that confuses classifiers. The
/// `trap '... $$'` pattern in particular contains a non-deterministic
/// signal (`$$` = process ID) that is present in every transpiled script,
/// causing safe scripts to be misclassified as non-deterministic.
///
/// A row whose serialization fails yields an empty string and is dropped;
/// the remaining lines are joined with `\n` without a trailing newline.
pub fn export_classification_jsonl(rows: &[DatasetRow]) -> String {
    let mut lines: Vec<String> = Vec::new();
    for row in rows {
        let classified = classify_single(
            &row.input_rust,
            row.transpiled,
            row.lint_clean,
            row.deterministic,
        );
        let json = serde_json::to_string(&classified).unwrap_or_default();
        if json.is_empty() {
            // Serialization failed; skip instead of emitting a blank line.
            continue;
        }
        lines.push(json);
    }
    lines.join("\n")
}

/// Build one binary classification row for ML training.
///
/// This is the canonical single-entry classification path. Both
/// `export_classification_jsonl` (batch) and `fast_classify_export`
/// (streaming) must go through it so that preamble stripping and labeling
/// cannot drift apart.
///
/// **Label** (safe=0, unsafe=1):
/// - `0` only when `transpiled`, `lint_clean` and `deterministic` are all true
/// - `1` as soon as any one of them is false
///
/// **Model input**: the original script — what users feed at inference
/// time — with shell preamble removed by `strip_shell_preamble`. It is NOT
/// the transpiled output, which users do not have at inference time.
///
/// # Parameters
/// - `original_input`: raw bash/makefile/dockerfile (used as model input text)
/// - `transpiled`: whether transpilation succeeded
/// - `lint_clean`: whether the output passed lint checks
/// - `deterministic`: whether two transpilations produce identical output
pub fn classify_single(
    original_input: &str,
    transpiled: bool,
    lint_clean: bool,
    deterministic: bool,
) -> ClassificationRow {
    let label = match (transpiled, lint_clean, deterministic) {
        (true, true, true) => 0,
        _ => 1,
    };
    let input = strip_shell_preamble(original_input);
    ClassificationRow { input, label }
}

/// Remove transpiler boilerplate lines from a shell script.
///
/// Lines are dropped when, after trimming, they are recognised by
/// `is_shell_preamble` or are one of the structural wrappers handled here:
/// - Shebang (`#!/bin/sh`)
/// - Shell options (`set -euf`)
/// - IFS reset (`IFS=' \t\n'`), including the lone `'` that closes a
///   multi-line IFS assignment
/// - Locale export (`export LC_ALL=C`)
/// - Trap cleanup (`trap 'rm -rf ...' EXIT`)
/// - Function wrappers (`main() {`, `}`, `main "$@"`)
/// - Generated comments (`# Generated by Rash`, `# POSIX-compliant`, etc.)
///
/// Surviving lines are dedented (leading whitespace removed) and joined with
/// `\n`. If nothing survives, the original `script` is returned unchanged so
/// callers never receive an empty input.
pub fn strip_shell_preamble(script: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();

    for raw in script.lines() {
        let s = raw.trim();

        // Structural wrappers are filtered here rather than in
        // `is_shell_preamble`; "'" is the closing quote of a multi-line IFS.
        let is_wrapper = matches!(s, "main() {" | "}" | "'");
        if is_shell_preamble(s) || is_wrapper {
            continue;
        }

        // Dedent lines that lived inside `main() { ... }`. A whitespace-only
        // line would be kept verbatim, though such lines are already removed
        // above because `is_shell_preamble` accepts an empty trimmed line.
        let dedented = raw.trim_start();
        kept.push(if dedented.is_empty() { raw } else { dedented });
    }

    if kept.is_empty() {
        // Don't produce empty inputs — fall back to the original text.
        script.to_string()
    } else {
        kept.join("\n")
    }
}

/// Report whether a trimmed line is shell preamble rather than user code.
///
/// A line counts as preamble when it is empty, is exactly `main "$@"`, or
/// starts with one of: `#`, `set `, `IFS=`, `export `, `trap `.
///
/// Used by `strip_shell_preamble` and corpus B2 commands to identify
/// transpiler boilerplate that should be excluded from classification input.
///
/// Note: structural markers such as `main() {` and `}` are deliberately NOT
/// matched — `extract_bash_main_body` needs them for state tracking, and
/// `strip_shell_preamble` filters them on its own.
pub fn is_shell_preamble(s: &str) -> bool {
    const BOILERPLATE_PREFIXES: [&str; 5] = ["#", "set ", "IFS=", "export ", "trap "];

    if s.is_empty() || s == "main \"$@\"" {
        return true;
    }
    BOILERPLATE_PREFIXES
        .iter()
        .any(|&prefix| s.starts_with(prefix))
}

/// Export the corpus as multi-label classification JSONL (SSC-021).
///
/// Each emitted row carries ALL applicable labels as a multi-hot vector
/// rather than a single primary class.
/// Output: `{"input":"...","labels":[0.0, 1.0, 1.0, 0.0, 0.0]}`
///
/// Only rows with `transpiled == true` are exported. The input text is the
/// row's `actual_output` with shell preamble stripped (same stripping as the
/// single-label export); the labels come from `derive_multi_label` applied to
/// the unstripped `actual_output`. Rows that fail to serialize are dropped,
/// and lines are joined with `\n` without a trailing newline.
pub fn export_multi_label_classification_jsonl(rows: &[DatasetRow]) -> String {
    let mut lines: Vec<String> = Vec::new();

    for row in rows {
        if !row.transpiled {
            continue;
        }

        let ml = MultiLabelClassificationRow {
            labels: derive_multi_label(
                &row.actual_output,
                row.transpiled,
                row.lint_clean,
                row.deterministic,
            ),
            input: strip_shell_preamble(&row.actual_output),
        };

        let json = serde_json::to_string(&ml).unwrap_or_default();
        if !json.is_empty() {
            lines.push(json);
        }
    }

    lines.join("\n")
}

/// Compute the multi-hot label vector for one script (SSC-021).
///
/// Unlike `derive_safety_label`, which picks ONE priority class, every class
/// here is evaluated independently, so several slots may be `1.0` at once:
/// - index 0 (safe): set only when none of the other classes is set
/// - index 1 (needs-quoting): `has_unquoted_variable(shell_output)`
/// - index 2 (non-deterministic): `deterministic` is false
/// - index 3 (non-idempotent): `has_non_idempotent_pattern(shell_output)`
/// - index 4 (unsafe): `transpiled` or `lint_clean` is false
///
/// Returns an array of five `f32` values, each either `0.0` or `1.0`.
pub fn derive_multi_label(
    shell_output: &str,
    transpiled: bool,
    lint_clean: bool,
    deterministic: bool,
) -> [f32; 5] {
    let is_unsafe = !(transpiled && lint_clean);
    let non_deterministic = !deterministic;
    let non_idempotent = has_non_idempotent_pattern(shell_output);
    let needs_quoting = has_unquoted_variable(shell_output);

    // "Safe" is the absence of every other class, never an independent signal.
    let safe = !(needs_quoting || non_deterministic || non_idempotent || is_unsafe);

    let hot = |active: bool| -> f32 {
        if active {
            1.0
        } else {
            0.0
        }
    };

    [
        hot(safe),
        hot(needs_quoting),
        hot(non_deterministic),
        hot(non_idempotent),
        hot(is_unsafe),
    ]
}

include!("dataset_export.rs");