/// Builds a three-turn conversation (system, user, assistant) in which the
/// assistant confirms that `input.script` is safe.
///
/// `variant` picks both the user prompt (from `SAFE_PROMPTS`) and the opening
/// sentence of the assistant reply; each index wraps around its own table
/// length via modulo.
fn generate_confirm_safe(input: &ConversationInput<'_>, variant: usize) -> Vec<Turn> {
    /// Opening sentences for the assistant reply.
    const OPENINGS: [&str; 10] = [
        "This script looks safe.",
        "This script appears to be well-written and safe.",
        "I don't see any security issues in this script.",
        "This script follows good practices.",
        "This script is safe to run.",
        "No security concerns found in this script.",
        "This script looks good.",
        "This is a clean, safe script.",
        "No issues detected in this script.",
        "This script appears production-ready.",
    ];
    /// Fixed tail appended to every opening sentence.
    const NO_FINDINGS: &str = " It doesn't contain known unsafe patterns like \
        command injection, non-deterministic operations, or non-idempotent commands.";

    // User turn: the selected prompt followed by the script in a bash fence.
    let question = format!(
        "{}\n\n```bash\n{}\n```",
        SAFE_PROMPTS[variant % SAFE_PROMPTS.len()],
        input.script
    );

    // Assistant turn: the selected opening plus the fixed explanation.
    let verdict = format!("{}{}", OPENINGS[variant % OPENINGS.len()], NO_FINDINGS);

    vec![
        system_turn(),
        Turn {
            role: "user",
            content: question,
        },
        Turn {
            role: "assistant",
            content: verdict,
        },
    ]
}
/// Returns the `"system"` turn carrying an owned copy of `SYSTEM_PROMPT`.
fn system_turn() -> Turn {
    let content = String::from(SYSTEM_PROMPT);
    Turn {
        role: "system",
        content,
    }
}
/// Rewrites `script` line by line, applying the fix for each diagnostic to the
/// line it starts on.
///
/// Diagnostics are applied in slice order. `span.start_line` is treated as
/// 1-based (a value of 0 saturates to the first line); diagnostics that point
/// past the last line are ignored. The result is the patched lines joined with
/// `'\n'`, without a trailing newline.
fn apply_safety_fixes(script: &str, diagnostics: &[Diagnostic]) -> String {
    let mut patched: Vec<String> = script.lines().map(str::to_owned).collect();

    for diag in diagnostics {
        let idx = diag.span.start_line.saturating_sub(1);
        // `get_mut` doubles as the bounds check for out-of-range diagnostics.
        if let Some(slot) = patched.get_mut(idx) {
            let rewritten = apply_single_fix(slot.as_str(), &diag.code);
            *slot = rewritten;
        }
    }

    patched.join("\n")
}
/// Applies the mechanical fix associated with a lint `code` to a single line.
///
/// * `IDEM001` / `IDEM002` / `IDEM003` — add `-p` to `mkdir`, `-f` to `rm`,
///   and turn `ln -s` into `ln -sf`.
/// * `SEC001` / `SEC002` — comment the line out with a `# REMOVED (unsafe): `
///   prefix.
/// * `DET001` / `DET002` — replace `$RANDOM` and `$(date)` with fixed values.
/// * Any other code returns the line unchanged.
///
/// Every fix is idempotent: applying the same code twice (for example because
/// two diagnostics point at the same line) yields the same result as applying
/// it once.
fn apply_single_fix(line: &str, code: &str) -> String {
    const REMOVED_PREFIX: &str = "# REMOVED (unsafe): ";

    match code {
        "IDEM001" => add_idempotency_flag(line, "mkdir ", "mkdir -p ", 'p'),
        "IDEM002" => add_idempotency_flag(line, "rm ", "rm -f ", 'f'),
        "IDEM003" => add_idempotency_flag(line, "ln -s ", "ln -sf ", 'f'),
        "SEC001" | "SEC002" => {
            // Do not stack the prefix when the line was already commented out.
            if line.starts_with(REMOVED_PREFIX) {
                line.to_string()
            } else {
                format!("{REMOVED_PREFIX}{line}")
            }
        }
        "DET001" => line.replace("$RANDOM", "42"),
        "DET002" => line.replace("$(date)", "\"2026-01-01\""),
        _ => line.to_string(),
    }
}

/// Replaces each occurrence of `needle` with `replacement`, skipping
/// occurrences where the rewrite would be wrong or redundant.
///
/// An occurrence is left untouched when:
/// * it is the tail of a longer word or option (preceded by an alphanumeric
///   character, `_` or `-`), e.g. the `rm ` inside `confirm ` or `--rm `;
/// * the token that follows is a short-option cluster (`-x...`, not `--...`)
///   that already contains `flag`, e.g. `mkdir -p` or `rm -rf`.
fn add_idempotency_flag(line: &str, needle: &str, replacement: &str, flag: char) -> String {
    // An empty needle would match forever at position 0.
    if needle.is_empty() {
        return line.to_string();
    }

    let mut fixed = String::with_capacity(line.len() + replacement.len());
    let mut rest = line;
    // Character immediately before `rest` in the original line, if any.
    let mut prev: Option<char> = None;

    while let Some(pos) = rest.find(needle) {
        let (before, tail) = rest.split_at(pos);
        let after = &tail[needle.len()..];

        let preceding = before.chars().next_back().or(prev);
        let mid_word = matches!(
            preceding,
            Some(c) if c.is_alphanumeric() || c == '_' || c == '-'
        );
        let has_flag = matches!(
            after.split_whitespace().next(),
            Some(opt) if opt.starts_with('-') && !opt.starts_with("--") && opt.contains(flag)
        );

        fixed.push_str(before);
        fixed.push_str(if mid_word || has_flag { needle } else { replacement });

        prev = needle.chars().next_back();
        rest = after;
    }

    fixed.push_str(rest);
    fixed
}
/// Returns `true` when no single prompt variant dominates the dataset.
///
/// A variant is identified by the first line of the turn at index 1 of each
/// conversation (conversations without such a turn are not counted, but still
/// contribute to the total). The check fails as soon as any variant's share of
/// all conversations exceeds 20%. An empty input passes.
fn check_variant_distribution(conversations: &[Conversation]) -> bool {
    const MAX_SHARE: f64 = 0.20;

    if conversations.is_empty() {
        return true;
    }

    // Tally conversations per first line of the turn at index 1.
    let mut tally = std::collections::HashMap::<&str, usize>::new();
    for prompt in conversations.iter().filter_map(|conv| conv.turns.get(1)) {
        let first_line = prompt.content.lines().next().unwrap_or("");
        *tally.entry(first_line).or_insert(0) += 1;
    }

    let total = conversations.len();
    let dominated = tally
        .values()
        .any(|&count| count as f64 / total as f64 > MAX_SHARE);
    !dominated
}
pub fn to_jsonl(conversations: &[Conversation]) -> String {
let mut output = String::new();
for conv in conversations {
if let Ok(json) = serde_json::to_string(conv) {
output.push_str(&json);
output.push('\n');
}
}
output
}
/// Serializes conversations as JSON Lines with `text`, `instruction`,
/// `response` and `system` fields per record.
///
/// For each conversation the first turn of each role is used; a missing system
/// turn falls back to `SYSTEM_PROMPT`. `text` is the three parts wrapped in
/// `<|im_start|>` / `<|im_end|>` markers. Conversations whose user or assistant
/// content is missing or empty are skipped, as are records that fail to
/// serialize.
pub fn to_entrenar_jsonl(conversations: &[Conversation]) -> String {
    let mut jsonl = String::new();

    for conv in conversations {
        // Content of the first turn with the given role, if any.
        let first_content = |role: &str| {
            conv.turns
                .iter()
                .find(|turn| turn.role == role)
                .map(|turn| turn.content.as_str())
        };

        let system = first_content("system").unwrap_or(SYSTEM_PROMPT);
        let instruction = first_content("user").unwrap_or("");
        let response = first_content("assistant").unwrap_or("");

        // Only emit records that have both sides of the exchange.
        if !instruction.is_empty() && !response.is_empty() {
            let text = format!(
                "<|im_start|>system\n{system}<|im_end|>\n\
                 <|im_start|>user\n{instruction}<|im_end|>\n\
                 <|im_start|>assistant\n{response}<|im_end|>"
            );
            let sample = serde_json::json!({
                "text": text,
                "instruction": instruction,
                "response": response,
                "system": system,
            });
            if let Ok(record) = serde_json::to_string(&sample) {
                jsonl.push_str(&record);
                jsonl.push('\n');
            }
        }
    }

    jsonl
}
/// Renders the dataset README (YAML front matter followed by Markdown) for a
/// generated conversation set.
///
/// Only the "Dataset Summary" section depends on `report` (totals per
/// conversation type, the Type D percentage with one decimal, and the quality
/// gate as `PASSED`/`FAILED`); every other section is fixed text.
pub fn generate_dataset_readme(report: &QualityReport) -> String {
    const FRONT_MATTER: &str = "---\n\
        language:\n\
        - en\n\
        license: apache-2.0\n\
        task_categories:\n\
        - text-classification\n\
        - text-generation\n\
        tags:\n\
        - shell\n\
        - bash\n\
        - security\n\
        - safety\n\
        - code-analysis\n\
        - synthetic\n\
        size_categories:\n\
        - 10K<n<100K\n\
        ---\n\n";

    const INTRO: &str = "# Shell Safety Conversations\n\n\
        Synthetic instruction-following conversations for shell script safety analysis. \
        Generated from the bashrs corpus using rule-based linter findings.\n\n";

    const LIMITATIONS: &str = "## Limitations and Bias\n\n\
        This dataset is generated from **rule-based linter output**, not from human security experts \
        or independent safety reasoning. The conversations:\n\n\
        - Explain known unsafe patterns (SEC001-SEC024, DET001-DET006, IDEM001-IDEM006)\n\
        - Do NOT perform novel security reasoning\n\
        - May produce generic responses for scripts outside rule coverage\n\
        - Are NOT a replacement for professional security audit\n\
        - Use synthetic phrasing variants (12 per type) for diversity\n\n";

    // Built with `concat!` rather than `\` continuations because several lines
    // begin with a space that a continuation would strip.
    const DATA_FORMAT: &str = concat!(
        "## Data Format\n\n",
        "Each entry is a JSON object with:\n\n",
        "```json\n",
        "{\n",
        " \"id\": \"conv-B-1234-classify-explain\",\n",
        " \"conversation_type\": \"ClassifyExplain\",\n",
        " \"turns\": [\n",
        " {\"role\": \"system\", \"content\": \"You are a shell script safety analyzer...\"},\n",
        " {\"role\": \"user\", \"content\": \"Is this script safe?\\n\\n```bash\\neval $x\\n```\"},\n",
        " {\"role\": \"assistant\", \"content\": \"This script is **unsafe**...\"}\n",
        " ]\n",
        "}\n",
        "```\n\n",
    );

    const SOURCE_NOTE: &str = "## Source\n\n\
        Generated by `bashrs corpus generate-conversations` from the bashrs corpus \
        (17,942 shell script entries). See [bashrs](https://github.com/paiml/bashrs) \
        and the [SSC v11 specification](https://github.com/paiml/bashrs/blob/main/docs/specifications/shell-safety-inference.md).\n";

    let gate = if report.passed { "PASSED" } else { "FAILED" };

    let mut card = String::new();
    card.push_str(FRONT_MATTER);
    card.push_str(INTRO);

    // The only report-dependent section; the result is ignored as in the
    // rest of the file's `write!` calls on a `String`.
    let _ = write!(
        card,
        "## Dataset Summary\n\n\
         - **Total conversations**: {}\n\
         - **Type A (Classify+Explain)**: {}\n\
         - **Type B (Fix)**: {}\n\
         - **Type C (Debug)**: {}\n\
         - **Type D (Confirm Safe)**: {} ({:.1}%)\n\
         - **Format**: ChatML (system + user + assistant)\n\
         - **Quality gate**: {}\n\n",
        report.total,
        report.type_a_count,
        report.type_b_count,
        report.type_c_count,
        report.type_d_count,
        report.type_d_pct,
        gate
    );

    card.push_str(LIMITATIONS);
    card.push_str(DATA_FORMAT);
    card.push_str(SOURCE_NOTE);
    card
}
// Unit tests are kept in a separate file (named by `#[path]`) and are only
// compiled for test builds.
#[cfg(test)]
#[path = "conversations_tests_generate_cla.rs"]
mod tests_extracted;