use crate::corpus::registry::{CorpusEntry, CorpusFormat, CorpusRegistry};
use crate::corpus::runner::{CorpusResult, CorpusRunner, CorpusScore};
use crate::Config;
use std::collections::HashMap;
use std::fmt;
/// Collapses the quality signals for one transpiled script into a single
/// safety class index.
///
/// The checks run in priority order and the first one that fires wins:
///
/// | Return | Condition                                              |
/// |--------|--------------------------------------------------------|
/// | `4`    | transpilation failed, or the output is not lint-clean  |
/// | `2`    | the output is not deterministic                        |
/// | `3`    | [`has_non_idempotent_pattern`] matches `shell_output`  |
/// | `1`    | [`has_unquoted_variable`] matches `shell_output`       |
/// | `0`    | none of the above                                      |
///
/// Note that the priority order (4, 2, 3, 1) is not the numeric order.
pub fn derive_safety_label(
    shell_output: &str,
    transpiled: bool,
    lint_clean: bool,
    deterministic: bool,
) -> u8 {
    let pipeline_ok = transpiled && lint_clean;
    if !pipeline_ok {
        4
    } else if !deterministic {
        2
    } else if has_non_idempotent_pattern(shell_output) {
        3
    } else if has_unquoted_variable(shell_output) {
        1
    } else {
        0
    }
}
/// Reports whether any line of `script` is a command that fails or changes
/// behaviour when run a second time.
///
/// Blank lines and lines starting with `#` are ignored. A line is flagged
/// when, after trimming, it is one of:
///
/// * `mkdir ...` without `-p` / `--parents`
/// * `rm ...` without `-f` / `--force`
/// * `ln ...` with `-s` / `--symbolic` but without `-f` / `--force`
///
/// Options are recognised per whitespace-separated token, so operands that
/// merely contain a flag-like substring (`mkdir my-project`, `rm my-file`)
/// are not mistaken for options, and clustered short options in any order
/// (`-snf`, `-Rf`) are understood. Tokens after a bare `--` are treated as
/// operands.
///
/// This is a line-based heuristic, not a shell parser: only the first
/// command on a line is inspected and quoting is not interpreted.
pub fn has_non_idempotent_pattern(script: &str) -> bool {
    // True when `args` contains the option, either as a letter inside a
    // short-option cluster (`-rf`) or as the exact long name (`--force`).
    fn has_flag(args: &str, short: char, long: &str) -> bool {
        args.split_whitespace()
            // A bare `--` ends option parsing; everything after is an operand.
            .take_while(|tok| *tok != "--")
            .any(|tok| match tok.strip_prefix("--") {
                Some(name) => name == long,
                None => tok
                    .strip_prefix('-')
                    .map_or(false, |letters| letters.contains(short)),
            })
    }

    script
        .lines()
        .map(str::trim)
        .filter(|line| !line.is_empty() && !line.starts_with('#'))
        .any(|line| {
            if let Some(args) = line.strip_prefix("mkdir ") {
                !has_flag(args, 'p', "parents")
            } else if let Some(args) = line.strip_prefix("rm ") {
                !has_flag(args, 'f', "force")
            } else if let Some(args) = line.strip_prefix("ln ") {
                // A plain hard link is not flagged; only a symlink created
                // without force is.
                has_flag(args, 's', "symbolic") && !has_flag(args, 'f', "force")
            } else {
                false
            }
        })
}
pub fn has_unquoted_variable(script: &str) -> bool {
for line in script.lines() {
let trimmed = line.trim();
if trimmed.is_empty() || trimmed.starts_with('#') {
continue;
}
if line_has_unquoted_var(trimmed) {
return true;
}
}
false
}
/// Scans one shell line for a `$name`, `$_name` or `${...}` expansion that
/// sits outside both single and double quotes.
///
/// The scan is byte-wise and tracks three pieces of shell quoting:
///
/// * `'` toggles single-quote state (unless inside double quotes),
/// * `"` toggles double-quote state (unless inside single quotes),
/// * `\` escapes the following byte, except inside single quotes where the
///   shell treats a backslash as a literal character.
///
/// Only a `$` followed by an ASCII letter, `_` or `{` counts; positional and
/// special parameters (`$1`, `$@`, `$?`) and command substitution (`$(`) are
/// deliberately not reported.
fn line_has_unquoted_var(line: &str) -> bool {
    let bytes = line.as_bytes();
    let mut in_double_quotes = false;
    let mut in_single_quotes = false;
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'\'' if !in_double_quotes => in_single_quotes = !in_single_quotes,
            b'"' if !in_single_quotes => in_double_quotes = !in_double_quotes,
            // Inside single quotes a backslash is literal, so `'a\'` is a
            // complete quoted word and must not swallow the closing quote.
            b'\\' if !in_single_quotes && i + 1 < bytes.len() => {
                i += 2;
                continue;
            }
            b'$' if !in_single_quotes && !in_double_quotes => {
                if let Some(&next) = bytes.get(i + 1) {
                    if next.is_ascii_alphabetic() || next == b'_' || next == b'{' {
                        return true;
                    }
                }
            }
            _ => {}
        }
        i += 1;
    }
    false
}
/// Maps a numeric score to a letter grade.
///
/// The score is first converted to `u32` with an `as` cast, which drops the
/// fractional part and saturates (negative values and NaN become `0`). The
/// resulting whole number selects the grade:
///
/// `97..=100` A+, `93..=96` A, `90..=92` A-, `87..=89` B+, `83..=86` B,
/// `80..=82` B-, `77..=79` C+, `73..=76` C, `70..=72` C-, `60..=69` D.
///
/// Anything else, including whole values above 100, yields `"F"`.
fn score_to_grade(score: f64) -> String {
    // Lower bound of each band, highest first; the first band whose floor is
    // reached is the grade.
    const BANDS: [(u32, &str); 10] = [
        (97, "A+"),
        (93, "A"),
        (90, "A-"),
        (87, "B+"),
        (83, "B"),
        (80, "B-"),
        (77, "C+"),
        (73, "C"),
        (70, "C-"),
        (60, "D"),
    ];

    let points = score as u32;
    let grade = if points > 100 {
        "F"
    } else {
        BANDS
            .iter()
            .find(|band| points >= band.0)
            .map_or("F", |band| band.1)
    };
    grade.to_string()
}
/// Serializes `rows` as JSON Lines: one compact JSON document per row,
/// separated by `\n`, with no trailing newline.
///
/// Rows that fail to serialize are silently left out of the output.
pub fn export_jsonl(rows: &[DatasetRow]) -> String {
    let mut lines = Vec::with_capacity(rows.len());
    for row in rows {
        if let Ok(json) = serde_json::to_string(row) {
            lines.push(json);
        }
    }
    lines.join("\n")
}
/// Exports `rows` as JSON Lines of binary classification records.
///
/// Each row is converted with [`classify_single`], using the row's
/// `input_rust` text and its `transpiled` / `lint_clean` / `deterministic`
/// flags, and then serialized. Records that fail to serialize (or serialize
/// to an empty string) are dropped. Lines are joined with `\n` and there is
/// no trailing newline.
pub fn export_classification_jsonl(rows: &[DatasetRow]) -> String {
    let mut lines = Vec::with_capacity(rows.len());
    for row in rows {
        let record = classify_single(
            &row.input_rust,
            row.transpiled,
            row.lint_clean,
            row.deterministic,
        );
        match serde_json::to_string(&record) {
            Ok(json) if !json.is_empty() => lines.push(json),
            _ => {}
        }
    }
    lines.join("\n")
}
/// Builds one binary classification record.
///
/// The label is `0` only when the entry transpiled, is lint-clean and is
/// deterministic; if any of the three flags is false the label is `1`.
/// The record's `input` is `original_input` passed through
/// [`strip_shell_preamble`].
pub fn classify_single(
    original_input: &str,
    transpiled: bool,
    lint_clean: bool,
    deterministic: bool,
) -> ClassificationRow {
    let label = match (transpiled, lint_clean, deterministic) {
        (true, true, true) => 0,
        _ => 1,
    };
    let input = strip_shell_preamble(original_input);
    ClassificationRow { input, label }
}
/// Removes boilerplate lines from a generated shell script and returns what
/// is left, joined with `\n`.
///
/// A line is dropped when its trimmed form is recognised by
/// [`is_shell_preamble`] (which includes blank lines and comments), or is
/// exactly `main() {`, `}` or a lone `'`. Surviving lines lose their leading
/// whitespace; trailing whitespace is kept.
///
/// If every line is dropped, the original `script` is returned unchanged
/// rather than an empty string.
pub fn strip_shell_preamble(script: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    for raw in script.lines() {
        let core = raw.trim();
        let is_wrapper = matches!(core, "main() {" | "}" | "'");
        if is_shell_preamble(core) || is_wrapper {
            continue;
        }
        let dedented = raw.trim_start();
        kept.push(if dedented.is_empty() { raw } else { dedented });
    }

    if kept.is_empty() {
        script.to_string()
    } else {
        kept.join("\n")
    }
}
/// Reports whether `s` is a boilerplate line rather than script body.
///
/// A line counts as preamble when it is empty, is exactly `main "$@"`, or
/// starts with one of `#`, `set `, `IFS=`, `export `, `trap `.
///
/// The check is done on `s` exactly as given; callers are expected to trim
/// the line first if leading whitespace should be ignored.
pub fn is_shell_preamble(s: &str) -> bool {
    const PREFIXES: [&str; 5] = ["#", "set ", "IFS=", "export ", "trap "];

    if s.is_empty() || s == "main \"$@\"" {
        return true;
    }
    PREFIXES.iter().any(|prefix| s.starts_with(*prefix))
}
/// Exports the transpiled rows as JSON Lines of multi-label records.
///
/// Rows whose `transpiled` flag is false are skipped. For each remaining row
/// the labels come from [`derive_multi_label`] applied to the row's
/// `actual_output`, and the record's `input` is that same output passed
/// through [`strip_shell_preamble`]. Records that fail to serialize (or
/// serialize to an empty string) are dropped. Lines are joined with `\n` and
/// there is no trailing newline.
pub fn export_multi_label_classification_jsonl(rows: &[DatasetRow]) -> String {
    let mut lines = Vec::new();
    for row in rows {
        if !row.transpiled {
            continue;
        }
        let labels = derive_multi_label(
            &row.actual_output,
            row.transpiled,
            row.lint_clean,
            row.deterministic,
        );
        let record = MultiLabelClassificationRow {
            input: strip_shell_preamble(&row.actual_output),
            labels,
        };
        match serde_json::to_string(&record) {
            Ok(json) if !json.is_empty() => lines.push(json),
            _ => {}
        }
    }
    lines.join("\n")
}
/// Produces a five-slot multi-hot label vector for one transpiled script.
///
/// Unlike [`derive_safety_label`], every check is evaluated independently,
/// so several slots can be `1.0` at once:
///
/// | Index | Set to `1.0` when                                        |
/// |-------|----------------------------------------------------------|
/// | `4`   | transpilation failed, or the output is not lint-clean    |
/// | `3`   | [`has_non_idempotent_pattern`] matches `shell_output`    |
/// | `2`   | the output is not deterministic                          |
/// | `1`   | [`has_unquoted_variable`] matches `shell_output`         |
/// | `0`   | none of the other four slots is set                      |
///
/// All remaining slots are `0.0`.
pub fn derive_multi_label(
    shell_output: &str,
    transpiled: bool,
    lint_clean: bool,
    deterministic: bool,
) -> [f32; 5] {
    let pipeline_failed = !(transpiled && lint_clean);
    let non_deterministic = !deterministic;
    let non_idempotent = has_non_idempotent_pattern(shell_output);
    let unquoted = has_unquoted_variable(shell_output);
    let clean = !(pipeline_failed || non_deterministic || non_idempotent || unquoted);

    let as_flag = |hit: bool| if hit { 1.0f32 } else { 0.0f32 };
    [
        as_flag(clean),
        as_flag(unquoted),
        as_flag(non_deterministic),
        as_flag(non_idempotent),
        as_flag(pipeline_failed),
    ]
}
/// Serializes `rows` as a single pretty-printed JSON array.
///
/// If serialization fails, the literal string `[]` is returned instead.
pub fn export_json(rows: &[DatasetRow]) -> String {
    match serde_json::to_string_pretty(rows) {
        Ok(json) => json,
        Err(_) => "[]".to_string(),
    }
}
/// Renders `rows` as CSV text: a fixed header line followed by one line per
/// row, each terminated by `\n`.
///
/// Only the `id` and `name` columns go through `csv_escape`; every other
/// column is written with its plain `Display` output, and `score` is
/// formatted with one decimal place.
pub fn export_csv(rows: &[DatasetRow]) -> String {
    let mut csv = String::from(
        "id,name,tier,format,transpiled,output_correct,lint_clean,deterministic,score,grade,safety_index,safety_label,bashrs_version,date\n",
    );
    csv.extend(rows.iter().map(|row| {
        format!(
            "{},{},{},{},{},{},{},{},{:.1},{},{},{},{},{}\n",
            csv_escape(&row.id),
            csv_escape(&row.name),
            row.tier,
            row.format,
            row.transpiled,
            row.output_correct,
            row.lint_clean,
            row.deterministic,
            row.score,
            row.grade,
            row.safety_index,
            row.safety_label,
            row.bashrs_version,
            row.date,
        )
    }));
    csv
}
/// Quotes a single CSV field when its content requires it.
///
/// A field containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes, with every embedded `"` doubled.
/// Any other field is returned unchanged.
///
/// Carriage return is included because CSV readers treat CR (alone or as
/// part of CRLF) as a line break, so an unquoted one would split the record.
fn csv_escape(s: &str) -> String {
    let needs_quoting = s.chars().any(|c| matches!(c, ',' | '"' | '\n' | '\r'));
    if needs_quoting {
        format!("\"{}\"", s.replace('"', "\"\""))
    } else {
        s.to_string()
    }
}
/// Summarizes a corpus registry into a [`DatasetInfo`].
///
/// The summary holds the total number of registry entries, a per-format
/// entry count for Bash, Makefile and Dockerfile (in that order), the schema
/// description from `dataset_schema_fields`, the crate version baked in at
/// compile time via `CARGO_PKG_VERSION`, and the value returned by
/// `current_date()`.
pub fn dataset_info(registry: &CorpusRegistry) -> DatasetInfo {
    let format_counts: Vec<(String, usize)> = [
        CorpusFormat::Bash,
        CorpusFormat::Makefile,
        CorpusFormat::Dockerfile,
    ]
    .iter()
    .map(|kind| {
        let matching = registry
            .entries
            .iter()
            .filter(|entry| entry.format == *kind)
            .count();
        (kind.to_string(), matching)
    })
    .collect();

    DatasetInfo {
        total_entries: registry.entries.len(),
        format_counts,
        schema_fields: dataset_schema_fields(),
        bashrs_version: env!("CARGO_PKG_VERSION").to_string(),
        date: current_date(),
    }
}
/// Returns the dataset schema as `(field name, type name, description)`
/// triples, in a fixed order.
///
/// The list is static descriptive metadata; nothing in this function checks
/// it against the actual row type.
fn dataset_schema_fields() -> Vec<(&'static str, &'static str, &'static str)> {
    const SCHEMA: [(&str, &str, &str); 18] = [
        ("id", "string", "Entry ID (B-001, M-042, D-015)"),
        ("name", "string", "Human-readable name"),
        ("tier", "int32", "Difficulty tier (1-5)"),
        ("format", "string", "bash, makefile, dockerfile"),
        ("input_rust", "string", "Rust DSL source code"),
        ("expected_output", "string", "Ground truth expected output"),
        ("actual_output", "string", "Transpiler actual output"),
        ("transpiled", "bool", "Transpilation succeeded?"),
        ("output_correct", "bool", "Output matches expected?"),
        ("lint_clean", "bool", "Output passes linter?"),
        ("deterministic", "bool", "Output identical across runs?"),
        ("score", "float64", "Per-entry score (0-100)"),
        ("grade", "string", "A+, A, B, C, D, F"),
        ("safety_index", "uint8", "Safety class (0=safe..4=unsafe)"),
        ("safety_label", "string", "Safety class label"),
        ("bashrs_version", "string", "e.g. 6.61.0"),
        ("commit_sha", "string", "Git commit SHA"),
        ("date", "string", "ISO 8601 date"),
    ];
    SCHEMA.to_vec()
}
pub fn format_dataset_info(info: &DatasetInfo) -> String {
let mut out = String::new();
let line = "\u{2500}".repeat(64);
out.push_str(&format!(
"bashrs v{} \u{2014} {}\n\n",
info.bashrs_version, info.date
));
out.push_str(&format!("Corpus: {} entries\n", info.total_entries));
for (fmt, count) in &info.format_counts {
out.push_str(&format!(" {:<14} {} entries\n", fmt, count));
}
}
include!("dataset_part3_incl2.rs");