use crate::corpus::baselines::corpus_baseline_entries;
use crate::corpus::dataset::{split_and_validate, ClassificationRow};
use std::fmt::Write as _;
pub fn generate_model_card() -> String {
let owned = corpus_baseline_entries();
let total = owned.len();
let safe_count = owned.iter().filter(|(_, l)| *l == 0).count();
let unsafe_count = owned.iter().filter(|(_, l)| *l == 1).count();
let rows: Vec<ClassificationRow> = owned
.into_iter()
.map(|(input, label)| ClassificationRow { input, label })
.collect();
let split = split_and_validate(rows, 2);
let train_len = split.train.len();
let val_len = split.val.len();
let test_len = split.test.len();
let train_unsafe = split.train.iter().filter(|r| r.label == 1).count();
let val_unsafe = split.val.iter().filter(|r| r.label == 1).count();
let test_unsafe = split.test.iter().filter(|r| r.label == 1).count();
let unsafe_pct = if total > 0 {
100.0 * unsafe_count as f64 / total as f64
} else {
0.0
};
let class_weight_safe = compute_class_weight(safe_count, total);
let class_weight_unsafe = compute_class_weight(unsafe_count, total);
let mut card = String::with_capacity(8192);
write_yaml_front_matter(&mut card, total, unsafe_pct);
write_header(&mut card);
write_model_description(&mut card);
write_dataset_section(&mut card, total, safe_count, unsafe_count, unsafe_pct);
write_splits_section(
&mut card,
train_len,
val_len,
test_len,
train_unsafe,
val_unsafe,
test_unsafe,
);
write_class_weights_section(&mut card, class_weight_safe, class_weight_unsafe);
write_baselines_section(&mut card);
write_intended_use(&mut card);
write_limitations(&mut card);
write_honesty_section(&mut card);
write_citation(&mut card);
card
}
fn compute_class_weight(class_count: usize, total: usize) -> f64 {
if class_count == 0 || total == 0 {
return 1.0;
}
let freq = class_count as f64 / total as f64;
(1.0 / freq).sqrt()
}
fn write_yaml_front_matter(card: &mut String, total: usize, unsafe_pct: f64) {
let _ = writeln!(card, "---");
let _ = writeln!(card, "language:");
let _ = writeln!(card, "- en");
let _ = writeln!(card, "license: apache-2.0");
let _ = writeln!(card, "tags:");
let _ = writeln!(card, "- shell-safety");
let _ = writeln!(card, "- bash");
let _ = writeln!(card, "- security");
let _ = writeln!(card, "- code-analysis");
let _ = writeln!(card, "- binary-classification");
let _ = writeln!(card, "task_categories:");
let _ = writeln!(card, "- text-classification");
let _ = writeln!(card, "size_categories:");
let _ = writeln!(
card,
"- {}",
if total > 10000 {
"10K<n<100K"
} else {
"1K<n<10K"
}
);
let _ = writeln!(card, "dataset_info:");
let _ = writeln!(card, " features:");
let _ = writeln!(card, " - name: input");
let _ = writeln!(card, " dtype: string");
let _ = writeln!(card, " - name: label");
let _ = writeln!(card, " dtype: int32");
let _ = writeln!(card, " splits:");
let _ = writeln!(card, " - name: train");
let _ = writeln!(card, " - name: validation");
let _ = writeln!(card, " - name: test");
let _ = writeln!(card, "configs:");
let _ = writeln!(card, "- config_name: default");
let _ = writeln!(card, " data_files:");
let _ = writeln!(card, " - split: train");
let _ = writeln!(card, " path: train.jsonl");
let _ = writeln!(card, " - split: validation");
let _ = writeln!(card, " path: val.jsonl");
let _ = writeln!(card, " - split: test");
let _ = writeln!(card, " path: test.jsonl");
let _ = writeln!(card, "annotations_creators:");
let _ = writeln!(card, "- machine-generated");
let _ = writeln!(card, "source_datasets:");
let _ = writeln!(card, "- original");
let _ = writeln!(card, "pretty_name: Shell Safety Classification Dataset");
let _ = writeln!(
card,
"class_distribution: \"safe={:.1}%, unsafe={:.1}%\"",
100.0 - unsafe_pct,
unsafe_pct
);
let _ = writeln!(card, "---");
let _ = writeln!(card);
}
fn write_header(card: &mut String) {
let _ = writeln!(card, "# Shell Safety Classification Dataset");
let _ = writeln!(card);
let _ = writeln!(
card,
"Binary safety classification of shell scripts: **safe** (0) vs **unsafe** (1)."
);
let _ = writeln!(card);
let _ = writeln!(
card,
"First-of-its-kind dataset for shell script safety analysis. Generated by the"
);
let _ = writeln!(
card,
"[bashrs](https://crates.io/crates/bashrs) transpilation corpus and rule-based linter."
);
let _ = writeln!(card);
}
fn write_model_description(card: &mut String) {
let _ = writeln!(card, "## Model Description");
let _ = writeln!(card);
let _ = writeln!(
card,
"This dataset supports the SSC v11 (Shell Safety Classifier) pipeline:"
);
let _ = writeln!(card);
let _ = writeln!(card, "| Stage | Model | Purpose | Latency |");
let _ = writeln!(card, "|-------|-------|---------|---------|");
let _ = writeln!(
card,
"| 0 | Rule-based linter | Known pattern detection | <1ms |"
);
let _ = writeln!(
card,
"| 1 | CodeBERT (125M) | Binary classification | ~20ms |"
);
let _ = writeln!(
card,
"| 2 | Qwen-1.5B + LoRA | Explain + suggest fixes | ~2s |"
);
let _ = writeln!(card);
}
fn write_dataset_section(
card: &mut String,
total: usize,
safe: usize,
unsafe_count: usize,
unsafe_pct: f64,
) {
let _ = writeln!(card, "## Dataset");
let _ = writeln!(card);
let _ = writeln!(card, "| Property | Value |");
let _ = writeln!(card, "|----------|-------|");
let _ = writeln!(card, "| Total entries | {total} |");
let _ = writeln!(card, "| Safe (label=0) | {safe} |");
let _ = writeln!(card, "| Unsafe (label=1) | {unsafe_count} |");
let _ = writeln!(card, "| Class imbalance | {unsafe_pct:.1}% unsafe |");
let _ = writeln!(card, "| Input format | Shell scripts (bash/sh) |");
let _ = writeln!(
card,
"| Label source | Rule-based linter (24 SEC + DET/IDEM rules) |"
);
let _ = writeln!(
card,
"| Preamble | Stripped (shebang, set -euf, trap, etc.) |"
);
let _ = writeln!(card);
}
fn write_splits_section(
card: &mut String,
train: usize,
val: usize,
test: usize,
train_unsafe: usize,
val_unsafe: usize,
test_unsafe: usize,
) {
let total = train + val + test;
let _ = writeln!(card, "### Splits");
let _ = writeln!(card);
let _ = writeln!(card, "Deterministic hash-based split (FNV-1a, 80/10/10):");
let _ = writeln!(card);
let _ = writeln!(card, "| Split | Total | Safe | Unsafe | %Unsafe |");
let _ = writeln!(card, "|-------|------:|-----:|-------:|--------:|");
let _ = writeln!(
card,
"| Train | {train} | {} | {train_unsafe} | {:.1}% |",
train - train_unsafe,
if train > 0 {
100.0 * train_unsafe as f64 / train as f64
} else {
0.0
}
);
let _ = writeln!(
card,
"| Val | {val} | {} | {val_unsafe} | {:.1}% |",
val - val_unsafe,
if val > 0 {
100.0 * val_unsafe as f64 / val as f64
} else {
0.0
}
);
let _ = writeln!(
card,
"| Test | {test} | {} | {test_unsafe} | {:.1}% |",
test - test_unsafe,
if test > 0 {
100.0 * test_unsafe as f64 / test as f64
} else {
0.0
}
);
let total_unsafe = train_unsafe + val_unsafe + test_unsafe;
let _ = writeln!(
card,
"| **Total** | **{total}** | **{}** | **{total_unsafe}** | **{:.1}%** |",
total - total_unsafe,
if total > 0 {
100.0 * total_unsafe as f64 / total as f64
} else {
0.0
}
);
let _ = writeln!(card);
}
fn write_class_weights_section(card: &mut String, w_safe: f64, w_unsafe: f64) {
let _ = writeln!(card, "### Class Weights");
let _ = writeln!(card);
let _ = writeln!(
card,
"Recommended sqrt-inverse weights for imbalanced training:"
);
let _ = writeln!(card);
let _ = writeln!(card, "| Class | Weight |");
let _ = writeln!(card, "|-------|-------:|");
let _ = writeln!(card, "| safe (0) | {w_safe:.3} |");
let _ = writeln!(card, "| unsafe (1) | {w_unsafe:.3} |");
let _ = writeln!(card);
}
fn write_baselines_section(card: &mut String) {
let _ = writeln!(card, "## Baselines");
let _ = writeln!(card);
let _ = writeln!(card, "Any ML classifier must beat at least one baseline:");
let _ = writeln!(card);
let _ = writeln!(card, "| Baseline | MCC | Accuracy | Precision | Recall |");
let _ = writeln!(card, "|----------|----:|--------:|---------:|------:|");
let _ = writeln!(
card,
"| Majority (all-safe) | 0.000 | ~{:.1}% | 0.0% | 0.0% |",
93.5
);
let _ = writeln!(card, "| Keyword regex | ~0.030 | ~97% | ~7% | ~2.5% |");
let _ = writeln!(
card,
"| bashrs linter (24 rules) | **1.000** | **100%** | **100%** | **100%** |"
);
let _ = writeln!(card);
let _ = writeln!(
card,
"> **Note**: The linter achieves perfect MCC because labels are derived FROM"
);
let _ = writeln!(
card,
"> linter output. The ML classifier's value is generalization to novel patterns"
);
let _ = writeln!(card, "> that the linter rules don't cover.");
let _ = writeln!(card);
}
fn write_intended_use(card: &mut String) {
let _ = writeln!(card, "## Intended Use");
let _ = writeln!(card);
let _ = writeln!(
card,
"- **Primary**: Train a binary classifier (CodeBERT 125M) for shell safety triage"
);
let _ = writeln!(
card,
"- **Secondary**: Fine-tune a chat model (Qwen-1.5B) for safety explanations"
);
let _ = writeln!(
card,
"- **CI/CD**: Automated shell script safety checks in build pipelines"
);
let _ = writeln!(
card,
"- **Education**: Learn about shell script safety patterns"
);
let _ = writeln!(card);
}
fn write_limitations(card: &mut String) {
let _ = writeln!(card, "## Limitations");
let _ = writeln!(card);
let _ = writeln!(
card,
"1. **Labels are linter-derived, not expert-audited**: An unsafe label means the"
);
let _ = writeln!(
card,
" bashrs linter flagged the script (SEC/DET rules), not that a human security"
);
let _ = writeln!(card, " expert reviewed it.");
let _ = writeln!(
card,
"2. **Class imbalance**: ~2% unsafe examples. Use class weights for training."
);
let _ = writeln!(
card,
"3. **Corpus-bound**: Scripts are from the bashrs transpilation corpus."
);
let _ = writeln!(
card,
" Real-world bash scripts may differ in style, length, and complexity."
);
let _ = writeln!(
card,
"4. **Not a security audit**: This dataset and any model trained on it"
);
let _ = writeln!(card, " detect known patterns, not novel vulnerabilities.");
let _ = writeln!(card);
}
fn write_honesty_section(card: &mut String) {
let _ = writeln!(card, "## Honesty Requirements (SSC v11 S6.5)");
let _ = writeln!(card);
let _ = writeln!(
card,
"Per the SSC specification, the following must be stated clearly:"
);
let _ = writeln!(card);
let _ = writeln!(
card,
"- Trained on **synthetic data derived from rule-based linter output**"
);
let _ = writeln!(
card,
"- Explains **known patterns**, not novel safety reasoning"
);
let _ = writeln!(
card,
"- For scripts outside rule coverage, responses may be **generic**"
);
let _ = writeln!(card, "- **Not a replacement for security audit**");
let _ = writeln!(card);
}
fn write_citation(card: &mut String) {
let _ = writeln!(card, "## Citation");
let _ = writeln!(card);
let _ = writeln!(card, "```bibtex");
let _ = writeln!(card, "@software{{bashrs,");
let _ = writeln!(
card,
" title = {{bashrs: Shell Safety and Purification Tool}},"
);
let _ = writeln!(card, " author = {{paiml engineering}},");
let _ = writeln!(card, " year = {{2026}},");
let _ = writeln!(card, " url = {{https://crates.io/crates/bashrs}}");
let _ = writeln!(card, "}}");
let _ = writeln!(card, "```");
}
#[cfg(test)]
#[path = "model_card_tests_generate_mod.rs"]
mod tests_extracted;