Skip to main content

aprender_shell/
synthetic_default_command_mutator.rs

1
2impl Default for CommandGenerator {
3    fn default() -> Self {
4        Self::new()
5    }
6}
7
/// Mutation engine for shell commands.
///
/// Produces variants of a whitespace-tokenised command by swapping its
/// subcommand (second token) or any recognised flag token, using two built-in
/// substitution tables.
pub struct CommandMutator {
    /// Flag substitutions: flag token -> replacement tokens. An empty
    /// replacement (`""`) means "drop the flag entirely".
    flag_subs: HashMap<&'static str, Vec<&'static str>>,
    /// Command substitutions: subcommand token -> alternative subcommands.
    cmd_subs: HashMap<&'static str, Vec<&'static str>>,
}

impl CommandMutator {
    /// Create new mutator with default rules
    #[must_use]
    pub fn new() -> Self {
        let mut flag_subs = HashMap::new();
        // "-m" lists itself as a replacement; the identity result is filtered
        // out again in `push_flag_substitutions`.
        flag_subs.insert("-m", vec!["-am", "--message", "-m"]);
        flag_subs.insert("--release", vec!["--debug", ""]);
        flag_subs.insert("--lib", vec!["--bin", "--doc", "--test", ""]);
        flag_subs.insert("-v", vec!["-vv", "-vvv", "--verbose", ""]);
        flag_subs.insert("-a", vec!["--all", ""]);
        flag_subs.insert("-f", vec!["--force", ""]);
        flag_subs.insert("-n", vec!["--dry-run", ""]);
        flag_subs.insert("-i", vec!["--interactive", ""]);
        flag_subs.insert("-r", vec!["-R", "--recursive", ""]);

        let mut cmd_subs = HashMap::new();
        cmd_subs.insert("commit", vec!["add", "status", "diff", "log"]);
        cmd_subs.insert("push", vec!["pull", "fetch"]);
        cmd_subs.insert("test", vec!["build", "check", "run", "bench"]);
        cmd_subs.insert("install", vec!["uninstall", "update", "add", "remove"]);
        cmd_subs.insert("start", vec!["stop", "restart", "status"]);
        cmd_subs.insert("up", vec!["down", "restart", "logs"]);
        cmd_subs.insert("create", vec!["delete", "update", "get", "describe"]);

        Self {
            flag_subs,
            cmd_subs,
        }
    }

    /// Generate mutations of a command.
    ///
    /// The command is split on whitespace and every mutation is emitted in
    /// single-space form. The result is sorted, deduplicated and never
    /// contains the (whitespace-normalised) input itself. A blank command
    /// yields an empty vector.
    pub fn mutate(&self, command: &str) -> Vec<String> {
        let parts: Vec<&str> = command.split_whitespace().collect();
        if parts.is_empty() {
            return Vec::new();
        }

        // Work against the canonical single-space form. Comparing or
        // formatting with the raw input would let an identity substitution
        // slip through (e.g. "git  commit -m x" -> "git commit -m x") and
        // would produce suffix variations carrying stray whitespace.
        let normalized = parts.join(" ");

        let mut mutations = Vec::new();
        self.mutate_subcommand(&parts, &mut mutations);
        self.mutate_flags(&parts, &normalized, &mut mutations);
        Self::append_common_variations(&normalized, &mut mutations);

        // Equal strings are indistinguishable, so an unstable sort is enough.
        mutations.sort_unstable();
        mutations.dedup();
        mutations
    }

    /// Substitute the second token (subcommand) with known alternatives.
    ///
    /// Does nothing when there is no second token or it has no entry in
    /// `cmd_subs`.
    fn mutate_subcommand(&self, parts: &[&str], mutations: &mut Vec<String>) {
        let Some(subs) = parts.get(1).and_then(|token| self.cmd_subs.get(*token)) else {
            return;
        };
        mutations.extend(subs.iter().map(|&sub| {
            let mut new_parts = parts.to_vec();
            new_parts[1] = sub;
            new_parts.join(" ")
        }));
    }

    /// Substitute matching flag tokens anywhere in the command.
    ///
    /// `command` must be the single-space-normalised form of `parts`.
    fn mutate_flags(&self, parts: &[&str], command: &str, mutations: &mut Vec<String>) {
        for (i, part) in parts.iter().enumerate() {
            if let Some(subs) = self.flag_subs.get(*part) {
                Self::push_flag_substitutions(parts, i, subs, command, mutations);
            }
        }
    }

    /// Push one mutation per replacement in `subs` for the token at index `i`.
    ///
    /// An empty replacement removes the token. Results that are empty or equal
    /// to `command` (the normalised original) are skipped.
    fn push_flag_substitutions(
        parts: &[&str],
        i: usize,
        subs: &[&'static str],
        command: &str,
        mutations: &mut Vec<String>,
    ) {
        for &sub in subs {
            let mut new_parts: Vec<&str> = parts.to_vec();
            if sub.is_empty() {
                new_parts.remove(i);
            } else {
                new_parts[i] = sub;
            }
            let new_cmd = new_parts.join(" ");
            if !new_cmd.is_empty() && new_cmd != command {
                mutations.push(new_cmd);
            }
        }
    }

    /// Add well-known flag variations for git/cargo when no long flags exist.
    ///
    /// `command` is expected in single-space-normalised form so the prefix
    /// checks and the appended suffix are not thrown off by stray whitespace.
    fn append_common_variations(command: &str, mutations: &mut Vec<String>) {
        if command.contains("--") {
            return;
        }
        if command.starts_with("git ") {
            mutations.push(format!("{command} --verbose"));
        }
        if command.starts_with("cargo ") {
            mutations.push(format!("{command} --release"));
            mutations.push(format!("{command} --all-features"));
        }
    }

    /// Mutate a batch of commands.
    ///
    /// Returns each input command (verbatim) followed by its mutations, in
    /// input order, keeping only the first occurrence of every string.
    pub fn mutate_batch(&self, commands: &[String]) -> Vec<String> {
        let mut all_mutations = Vec::new();
        let mut seen = HashSet::new();

        for cmd in commands {
            if seen.insert(cmd.clone()) {
                all_mutations.push(cmd.clone());
            }
            for mutation in self.mutate(cmd) {
                if seen.insert(mutation.clone()) {
                    all_mutations.push(mutation);
                }
            }
        }

        all_mutations
    }
}

impl Default for CommandMutator {
    fn default() -> Self {
        Self::new()
    }
}
148
/// Coverage-guided generator that fills gaps in n-gram model.
///
/// Ranks candidate commands by how many of their n-grams are absent from a
/// set of already-known n-grams.
pub struct CoverageGuidedGenerator {
    /// Known n-grams from training
    known_ngrams: HashSet<String>,
    /// Target n-gram size (a value of 0 is treated like 1)
    n: usize,
}

impl CoverageGuidedGenerator {
    /// Create from existing n-gram counts
    pub fn new(known_ngrams: HashSet<String>, n: usize) -> Self {
        Self { known_ngrams, n }
    }

    /// Generate commands that exercise underrepresented n-grams.
    ///
    /// Scans `base_commands` in order, keeping those that contribute at least
    /// one n-gram not in the known set, and stops scanning once `2 * count`
    /// such commands have been collected. The kept commands are ordered by
    /// their number of unknown n-grams (highest first, ties in input order)
    /// and at most `count` of them are returned.
    pub fn generate(&self, base_commands: &[String], count: usize) -> Vec<String> {
        // Saturate instead of overflowing for very large `count` values.
        let scan_limit = count.saturating_mul(2);
        let mut scored: Vec<(&String, usize)> = Vec::new();

        for cmd in base_commands {
            // NOTE(review): `extract_ngrams` lists the first token twice, so a
            // new first token is weighted double here — confirm this is intended.
            let gain = self
                .extract_ngrams(cmd)
                .iter()
                .filter(|ng| !self.known_ngrams.contains(*ng))
                .count();

            if gain > 0 {
                scored.push((cmd, gain));
            }

            if scored.len() >= scan_limit {
                break;
            }
        }

        // Stable sort by coverage gain (descending) keeps input order for ties.
        scored.sort_by(|a, b| b.1.cmp(&a.1));

        // Only the selected commands are cloned.
        scored
            .into_iter()
            .take(count)
            .map(|(cmd, _)| cmd.clone())
            .collect()
    }

    /// Split `command` on whitespace and return its n-grams as space-joined
    /// strings: the first token on its own, followed by, for every token, the
    /// window of up to `n` tokens ending at that token.
    fn extract_ngrams(&self, command: &str) -> Vec<String> {
        let tokens: Vec<&str> = command.split_whitespace().collect();
        // `n - 1` would underflow for n == 0; saturating keeps the window at
        // one token in that case.
        let lookback = self.n.saturating_sub(1);
        let mut ngrams = Vec::with_capacity(tokens.len() + 1);

        // First token as context
        if let Some(first) = tokens.first() {
            ngrams.push((*first).to_string());
        }

        // Build n-grams
        ngrams.extend((0..tokens.len()).map(|end| {
            let start = end.saturating_sub(lookback);
            tokens[start..=end].join(" ")
        }));

        ngrams
    }

    /// Report coverage stats for `commands`.
    ///
    /// Counts the distinct n-grams found in `commands` and how many of them
    /// are missing from the known set.
    pub fn coverage_report(&self, commands: &[String]) -> CoverageReport {
        let mut total_ngrams: HashSet<String> = HashSet::new();
        let mut new_ngrams: HashSet<String> = HashSet::new();

        for ng in commands.iter().flat_map(|cmd| self.extract_ngrams(cmd)) {
            if !self.known_ngrams.contains(&ng) {
                new_ngrams.insert(ng.clone());
            }
            total_ngrams.insert(ng);
        }

        let coverage_gain = if total_ngrams.is_empty() {
            0.0
        } else {
            new_ngrams.len() as f32 / total_ngrams.len() as f32
        };

        CoverageReport {
            known_ngrams: self.known_ngrams.len(),
            total_ngrams: total_ngrams.len(),
            new_ngrams: new_ngrams.len(),
            coverage_gain,
        }
    }
}

/// Coverage statistics
#[derive(Debug, Clone)]
pub struct CoverageReport {
    /// N-grams already in model
    pub known_ngrams: usize,
    /// Distinct n-grams in synthetic data
    pub total_ngrams: usize,
    /// Distinct n-grams in synthetic data that are not in the model
    pub new_ngrams: usize,
    /// Fraction (0.0..=1.0) of the synthetic n-grams that are new:
    /// `new_ngrams / total_ngrams`, or 0.0 when there are none
    pub coverage_gain: f32,
}
259
260/// Combined synthetic data pipeline
261pub struct SyntheticPipeline {
262    generator: CommandGenerator,
263    mutator: CommandMutator,
264}
265
266impl SyntheticPipeline {
267    /// Create new pipeline
268    #[must_use]
269    pub fn new() -> Self {
270        Self {
271            generator: CommandGenerator::new(),
272            mutator: CommandMutator::new(),
273        }
274    }
275
276    /// Generate synthetic training data
277    ///
278    /// 1. Generate base commands from templates
279    /// 2. Mutate real history for variations
280    /// 3. Use coverage-guided selection
281    pub fn generate(
282        &self,
283        real_history: &[String],
284        known_ngrams: HashSet<String>,
285        count: usize,
286    ) -> SyntheticResult {
287        // Step 1: Generate template commands
288        let template_commands = self.generator.generate(count * 2);
289
290        // Step 2: Mutate real history
291        let mutated_commands = self.mutator.mutate_batch(real_history);
292
293        // Step 3: Combine all candidates
294        let mut all_candidates: Vec<String> = template_commands;
295        all_candidates.extend(mutated_commands);
296
297        // Step 4: Coverage-guided selection
298        let coverage_gen = CoverageGuidedGenerator::new(known_ngrams.clone(), 3);
299        let selected = coverage_gen.generate(&all_candidates, count);
300
301        // Step 5: Generate report
302        let report = coverage_gen.coverage_report(&selected);
303
304        SyntheticResult {
305            commands: selected,
306            report,
307        }
308    }
309}
310
311impl Default for SyntheticPipeline {
312    fn default() -> Self {
313        Self::new()
314    }
315}
316
317/// Result of synthetic data generation
318#[derive(Debug)]
319pub struct SyntheticResult {
320    /// Generated commands
321    pub commands: Vec<String>,
322    /// Coverage report
323    pub report: CoverageReport,
324}
325
#[cfg(test)]
mod tests {
    use super::*;

    /// The template generator yields a sizeable corpus covering git and cargo.
    #[test]
    fn test_command_generator_creates_commands() {
        let commands = CommandGenerator::new().generate(1000);
        assert!(commands.len() >= 500);
        assert!(commands.iter().any(|c| c.starts_with("git")));
        assert!(commands.iter().any(|c| c.starts_with("cargo")));
    }

    /// Every generated template command is distinct.
    #[test]
    fn test_command_generator_no_duplicates() {
        let commands = CommandGenerator::new().generate(1000);
        let distinct = commands.iter().collect::<HashSet<_>>().len();
        assert_eq!(commands.len(), distinct);
    }

    /// Mutating a commit command produces sibling subcommands.
    #[test]
    fn test_mutator_creates_variations() {
        let mutations = CommandMutator::new().mutate("git commit -m test");
        assert!(!mutations.is_empty());
        let has_sibling = mutations
            .iter()
            .any(|m| m.contains("add") || m.contains("status"));
        assert!(has_sibling);
    }

    /// `--release` is either swapped for `--debug` or dropped.
    #[test]
    fn test_mutator_flag_substitution() {
        let mutations = CommandMutator::new().mutate("cargo build --release");
        let flag_changed = mutations
            .iter()
            .any(|m| m.contains("--debug") || !m.contains("--release"));
        assert!(flag_changed);
    }

    /// A candidate made only of known n-grams loses to one with new n-grams.
    #[test]
    fn test_coverage_guided_prioritizes_new_ngrams() {
        let known: HashSet<String> = ["git", "git commit"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let generator = CoverageGuidedGenerator::new(known, 3);

        let candidates = vec![
            String::from("git commit"), // Known
            String::from("cargo test"), // New
        ];

        let selected = generator.generate(&candidates, 1);
        assert_eq!(selected.len(), 1);
        assert!(selected[0].contains("cargo")); // Should prefer new
    }

    /// With nothing known up front, the pipeline must report new n-grams.
    #[test]
    fn test_pipeline_generates_diverse_data() {
        let history = vec![String::from("git status"), String::from("cargo test")];
        let result = SyntheticPipeline::new().generate(&history, HashSet::new(), 50);

        assert!(!result.commands.is_empty());
        assert!(result.report.new_ngrams > 0);
    }

    /// The report echoes the known-set size and detects unseen n-grams.
    #[test]
    fn test_coverage_report_accuracy() {
        let known: HashSet<String> = std::iter::once(String::from("git")).collect();
        let generator = CoverageGuidedGenerator::new(known, 2);

        let commands = vec![String::from("git status"), String::from("cargo test")];
        let report = generator.coverage_report(&commands);

        assert_eq!(report.known_ngrams, 1);
        assert!(report.new_ngrams > 0);
        assert!(report.coverage_gain > 0.0);
    }
}