impl Default for CommandGenerator {
fn default() -> Self {
Self::new()
}
}
pub struct CommandMutator {
flag_subs: HashMap<&'static str, Vec<&'static str>>,
cmd_subs: HashMap<&'static str, Vec<&'static str>>,
}
impl CommandMutator {
#[must_use]
pub fn new() -> Self {
let mut flag_subs = HashMap::new();
flag_subs.insert("-m", vec!["-am", "--message", "-m"]);
flag_subs.insert("--release", vec!["--debug", ""]);
flag_subs.insert("--lib", vec!["--bin", "--doc", "--test", ""]);
flag_subs.insert("-v", vec!["-vv", "-vvv", "--verbose", ""]);
flag_subs.insert("-a", vec!["--all", ""]);
flag_subs.insert("-f", vec!["--force", ""]);
flag_subs.insert("-n", vec!["--dry-run", ""]);
flag_subs.insert("-i", vec!["--interactive", ""]);
flag_subs.insert("-r", vec!["-R", "--recursive", ""]);
let mut cmd_subs = HashMap::new();
cmd_subs.insert("commit", vec!["add", "status", "diff", "log"]);
cmd_subs.insert("push", vec!["pull", "fetch"]);
cmd_subs.insert("test", vec!["build", "check", "run", "bench"]);
cmd_subs.insert("install", vec!["uninstall", "update", "add", "remove"]);
cmd_subs.insert("start", vec!["stop", "restart", "status"]);
cmd_subs.insert("up", vec!["down", "restart", "logs"]);
cmd_subs.insert("create", vec!["delete", "update", "get", "describe"]);
Self {
flag_subs,
cmd_subs,
}
}
pub fn mutate(&self, command: &str) -> Vec<String> {
let parts: Vec<&str> = command.split_whitespace().collect();
if parts.is_empty() {
return Vec::new();
}
let mut mutations = Vec::new();
self.mutate_subcommand(&parts, &mut mutations);
self.mutate_flags(&parts, command, &mut mutations);
Self::append_common_variations(command, &mut mutations);
mutations.sort();
mutations.dedup();
mutations
}
fn mutate_subcommand(&self, parts: &[&str], mutations: &mut Vec<String>) {
if parts.len() < 2 {
return;
}
let Some(subs) = self.cmd_subs.get(parts[1]) else {
return;
};
for sub in subs {
let mut new_parts = parts.to_vec();
new_parts[1] = sub;
mutations.push(new_parts.join(" "));
}
}
fn mutate_flags(&self, parts: &[&str], command: &str, mutations: &mut Vec<String>) {
for (i, part) in parts.iter().enumerate() {
let Some(subs) = self.flag_subs.get(*part) else {
continue;
};
Self::push_flag_substitutions(parts, i, subs, command, mutations);
}
}
fn push_flag_substitutions(
parts: &[&str],
i: usize,
subs: &[&'static str],
command: &str,
mutations: &mut Vec<String>,
) {
for sub in subs {
let mut new_parts: Vec<&str> = parts.to_vec();
if sub.is_empty() {
new_parts.remove(i);
} else {
new_parts[i] = sub;
}
let new_cmd = new_parts.join(" ");
if !new_cmd.is_empty() && new_cmd != command {
mutations.push(new_cmd);
}
}
}
fn append_common_variations(command: &str, mutations: &mut Vec<String>) {
if command.contains("--") {
return;
}
if command.starts_with("git ") {
mutations.push(format!("{command} --verbose"));
}
if command.starts_with("cargo ") {
mutations.push(format!("{command} --release"));
mutations.push(format!("{command} --all-features"));
}
}
pub fn mutate_batch(&self, commands: &[String]) -> Vec<String> {
let mut all_mutations = Vec::new();
let mut seen = HashSet::new();
for cmd in commands {
if seen.insert(cmd.clone()) {
all_mutations.push(cmd.clone());
}
for mutation in self.mutate(cmd) {
if seen.insert(mutation.clone()) {
all_mutations.push(mutation);
}
}
}
all_mutations
}
}
impl Default for CommandMutator {
fn default() -> Self {
Self::new()
}
}
/// Selects commands that contribute token n-grams not present in a known set.
pub struct CoverageGuidedGenerator {
    /// N-grams (space-joined tokens) that are already covered.
    known_ngrams: HashSet<String>,
    /// Maximum number of tokens per n-gram. `0` is treated like `1`.
    n: usize,
}

impl CoverageGuidedGenerator {
    /// Creates a generator for the given known n-grams and n-gram width `n`.
    pub fn new(known_ngrams: HashSet<String>, n: usize) -> Self {
        Self { known_ngrams, n }
    }

    /// Picks up to `count` commands from `base_commands`, preferring those
    /// with the most distinct n-grams absent from the known set.
    ///
    /// Commands that add nothing new are skipped. Scanning stops once
    /// `2 * count` candidates have been collected. Ties keep input order.
    pub fn generate(&self, base_commands: &[String], count: usize) -> Vec<String> {
        if count == 0 {
            return Vec::new();
        }
        // Saturate so an enormous `count` cannot overflow the scan budget.
        let scan_limit = count.saturating_mul(2);

        let mut scored: Vec<(&String, usize)> = Vec::new();
        for cmd in base_commands {
            let ngrams = self.extract_ngrams(cmd);
            // Count each unseen n-gram once, even if the command repeats it.
            let novel: HashSet<&str> = ngrams
                .iter()
                .map(String::as_str)
                .filter(|ng| !self.known_ngrams.contains(*ng))
                .collect();
            if !novel.is_empty() {
                scored.push((cmd, novel.len()));
            }
            if scored.len() >= scan_limit {
                break;
            }
        }

        // Stable sort: candidates with equal scores stay in input order.
        scored.sort_by(|a, b| b.1.cmp(&a.1));
        scored
            .into_iter()
            .take(count)
            .map(|(cmd, _)| cmd.clone())
            .collect()
    }

    /// Returns, for every token position, the space-joined window of up to
    /// `n` tokens ending at that position.
    ///
    /// The window ending at the first token is the bare first token.
    fn extract_ngrams(&self, command: &str) -> Vec<String> {
        let tokens: Vec<&str> = command.split_whitespace().collect();
        // `n == 0` would underflow with a plain `n - 1`; saturate instead.
        let lookback = self.n.saturating_sub(1);
        (0..tokens.len())
            .map(|end| tokens[end.saturating_sub(lookback)..=end].join(" "))
            .collect()
    }

    /// Summarises how many distinct n-grams `commands` contain and how many
    /// of those are absent from the known set.
    pub fn coverage_report(&self, commands: &[String]) -> CoverageReport {
        let total_ngrams: HashSet<String> = commands
            .iter()
            .flat_map(|cmd| self.extract_ngrams(cmd))
            .collect();
        let new_ngrams = total_ngrams
            .iter()
            .filter(|ng| !self.known_ngrams.contains(*ng))
            .count();

        CoverageReport {
            known_ngrams: self.known_ngrams.len(),
            total_ngrams: total_ngrams.len(),
            new_ngrams,
            coverage_gain: if total_ngrams.is_empty() {
                0.0
            } else {
                // Lossy for very large counts; acceptable for a ratio.
                new_ngrams as f32 / total_ngrams.len() as f32
            },
        }
    }
}

/// Figures produced by [`CoverageGuidedGenerator::coverage_report`].
#[derive(Debug, Clone)]
pub struct CoverageReport {
    /// Size of the generator's known n-gram set.
    pub known_ngrams: usize,
    /// Distinct n-grams found in the reported commands.
    pub total_ngrams: usize,
    /// Distinct n-grams in the reported commands absent from the known set.
    pub new_ngrams: usize,
    /// `new_ngrams / total_ngrams`, or `0.0` when there are no n-grams.
    pub coverage_gain: f32,
}
/// Combines template generation, mutation of real history and
/// coverage-guided selection into a single step.
pub struct SyntheticPipeline {
    /// Source of template-based commands.
    generator: CommandGenerator,
    /// Produces variations of commands taken from real history.
    mutator: CommandMutator,
}

impl SyntheticPipeline {
    /// Creates a pipeline with a freshly constructed generator and mutator.
    #[must_use]
    pub fn new() -> Self {
        Self {
            generator: CommandGenerator::new(),
            mutator: CommandMutator::new(),
        }
    }

    /// Produces up to `count` synthetic commands and a coverage report.
    ///
    /// Candidates are `2 * count` requested template commands followed by
    /// `real_history` and its mutations. A coverage-guided generator with
    /// n-gram width 3 then selects from them against `known_ngrams`.
    pub fn generate(
        &self,
        real_history: &[String],
        known_ngrams: HashSet<String>,
        count: usize,
    ) -> SyntheticResult {
        /// N-gram width used when scoring candidates.
        const NGRAM_WIDTH: usize = 3;

        // Saturate so an enormous `count` cannot overflow the request size.
        let mut all_candidates: Vec<String> = self.generator.generate(count.saturating_mul(2));
        all_candidates.extend(self.mutator.mutate_batch(real_history));

        // `known_ngrams` is owned and not needed afterwards, so move it in
        // rather than cloning the whole set.
        let coverage_gen = CoverageGuidedGenerator::new(known_ngrams, NGRAM_WIDTH);
        let commands = coverage_gen.generate(&all_candidates, count);
        let report = coverage_gen.coverage_report(&commands);

        SyntheticResult { commands, report }
    }
}

impl Default for SyntheticPipeline {
    fn default() -> Self {
        Self::new()
    }
}
/// Value returned by `SyntheticPipeline::generate`.
#[derive(Debug)]
pub struct SyntheticResult {
/// Commands chosen by the coverage-guided selection step.
pub commands: Vec<String>,
/// Coverage report computed over `commands`.
pub report: CoverageReport,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The template generator yields a large batch covering git and cargo.
    #[test]
    fn test_command_generator_creates_commands() {
        let produced = CommandGenerator::new().generate(1000);
        assert!(produced.len() >= 500);
        for prefix in ["git", "cargo"] {
            assert!(produced.iter().any(|cmd| cmd.starts_with(prefix)));
        }
    }

    /// Generated commands contain no repeats.
    #[test]
    fn test_command_generator_no_duplicates() {
        let produced = CommandGenerator::new().generate(1000);
        let distinct: HashSet<&String> = produced.iter().collect();
        assert_eq!(distinct.len(), produced.len());
    }

    /// A known subcommand is swapped for at least one sibling.
    #[test]
    fn test_mutator_creates_variations() {
        let variants = CommandMutator::new().mutate("git commit -m test");
        assert!(!variants.is_empty());
        let has_sibling = variants
            .iter()
            .any(|v| v.contains("add") || v.contains("status"));
        assert!(has_sibling);
    }

    /// A known flag is either replaced or dropped.
    #[test]
    fn test_mutator_flag_substitution() {
        let variants = CommandMutator::new().mutate("cargo build --release");
        let flag_changed = variants
            .iter()
            .any(|v| v.contains("--debug") || !v.contains("--release"));
        assert!(flag_changed);
    }

    /// A fully known command loses to one that brings new n-grams.
    #[test]
    fn test_coverage_guided_prioritizes_new_ngrams() {
        let mut known: HashSet<String> = HashSet::new();
        known.insert("git".to_string());
        known.insert("git commit".to_string());
        let picker = CoverageGuidedGenerator::new(known, 3);

        let candidates = vec!["git commit".to_string(), "cargo test".to_string()];
        let selected = picker.generate(&candidates, 1);

        assert_eq!(selected.len(), 1);
        assert!(selected[0].contains("cargo"));
    }

    /// The full pipeline returns commands and reports new n-grams.
    #[test]
    fn test_pipeline_generates_diverse_data() {
        let history = vec!["git status".to_string(), "cargo test".to_string()];
        let result = SyntheticPipeline::new().generate(&history, HashSet::new(), 50);

        assert!(!result.commands.is_empty());
        assert!(result.report.new_ngrams > 0);
    }

    /// The report reflects the known-set size and detects new n-grams.
    #[test]
    fn test_coverage_report_accuracy() {
        let mut known: HashSet<String> = HashSet::new();
        known.insert("git".to_string());
        let commands = vec!["git status".to_string(), "cargo test".to_string()];

        let report = CoverageGuidedGenerator::new(known, 2).coverage_report(&commands);

        assert_eq!(report.known_ngrams, 1);
        assert!(report.new_ngrams > 0);
        assert!(report.coverage_gain > 0.0);
    }
}