// aprender_shell/synthetic_default_command_mutator.rs

2impl Default for CommandGenerator {
3    fn default() -> Self {
4        Self::new()
5    }
6}
7
/// Mutation engine for shell commands.
///
/// Produces variations of a command line by swapping its subcommand, swapping
/// or dropping known flags, and appending a few common flags to `git` and
/// `cargo` invocations.
pub struct CommandMutator {
    /// Flag substitutions: flag token -> alternatives. An empty alternative
    /// means "drop the flag entirely".
    flag_subs: HashMap<&'static str, Vec<&'static str>>,
    /// Command substitutions: subcommand token -> alternative subcommands.
    cmd_subs: HashMap<&'static str, Vec<&'static str>>,
}

impl CommandMutator {
    /// Create new mutator with default rules.
    #[must_use]
    pub fn new() -> Self {
        // Rules are kept as static tables so the data reads as data; each
        // entry is `(token, alternatives)`.
        const FLAG_RULES: &[(&str, &[&str])] = &[
            ("-m", &["-am", "--message"]),
            ("--release", &["--debug", ""]),
            ("--lib", &["--bin", "--doc", "--test", ""]),
            ("-v", &["-vv", "-vvv", "--verbose", ""]),
            ("-a", &["--all", ""]),
            ("-f", &["--force", ""]),
            ("-n", &["--dry-run", ""]),
            ("-i", &["--interactive", ""]),
            ("-r", &["-R", "--recursive", ""]),
        ];
        const COMMAND_RULES: &[(&str, &[&str])] = &[
            ("commit", &["add", "status", "diff", "log"]),
            ("push", &["pull", "fetch"]),
            ("test", &["build", "check", "run", "bench"]),
            ("install", &["uninstall", "update", "add", "remove"]),
            ("start", &["stop", "restart", "status"]),
            ("up", &["down", "restart", "logs"]),
            ("create", &["delete", "update", "get", "describe"]),
        ];

        Self {
            flag_subs: FLAG_RULES
                .iter()
                .map(|&(flag, alternatives)| (flag, alternatives.to_vec()))
                .collect(),
            cmd_subs: COMMAND_RULES
                .iter()
                .map(|&(subcommand, alternatives)| (subcommand, alternatives.to_vec()))
                .collect(),
        }
    }

    /// Generate mutations of a command.
    ///
    /// The command is split on whitespace and all comparisons and generated
    /// strings use the single-space-joined form, so leading, trailing or
    /// repeated whitespace in `command` does not affect the result.
    ///
    /// Returns a sorted, de-duplicated list that never contains the
    /// (normalized) input itself. An empty or all-whitespace command yields
    /// an empty list.
    pub fn mutate(&self, command: &str) -> Vec<String> {
        let parts: Vec<&str> = command.split_whitespace().collect();
        if parts.is_empty() {
            return Vec::new();
        }

        // Canonical form of the input. Every generated mutation is built with
        // `join(" ")`, so this is the only form it is meaningful to compare
        // against or to extend with extra flags.
        let normalized = parts.join(" ");
        let mut mutations = Vec::new();

        // Mutate subcommand (second token usually).
        if let Some(subs) = parts.get(1).and_then(|token| self.cmd_subs.get(*token)) {
            for sub in subs {
                let mut new_parts = parts.clone();
                new_parts[1] = *sub;
                mutations.push(new_parts.join(" "));
            }
        }

        // Mutate flags: every token that has a rule is replaced by each of
        // its alternatives in turn (or removed for the empty alternative).
        for (i, part) in parts.iter().enumerate() {
            if let Some(subs) = self.flag_subs.get(*part) {
                for sub in subs {
                    let mut new_parts = parts.clone();
                    if sub.is_empty() {
                        new_parts.remove(i);
                    } else {
                        new_parts[i] = *sub;
                    }
                    let new_cmd = new_parts.join(" ");
                    // Removing the only token leaves nothing; an unchanged
                    // command is not a mutation.
                    if !new_cmd.is_empty() && new_cmd != normalized {
                        mutations.push(new_cmd);
                    }
                }
            }
        }

        // Add common flag variations, but only to commands that do not
        // already contain a `--` sequence.
        if !normalized.contains("--") {
            if normalized.starts_with("git ") {
                mutations.push(format!("{} --verbose", normalized));
            }
            if normalized.starts_with("cargo ") {
                mutations.push(format!("{} --release", normalized));
                mutations.push(format!("{} --all-features", normalized));
            }
        }

        // Remove duplicates (equal strings are interchangeable, so an
        // unstable sort is sufficient).
        mutations.sort_unstable();
        mutations.dedup();
        mutations
    }

    /// Mutate a batch of commands.
    ///
    /// The result contains each input command followed by its mutations, in
    /// input order, with every string appearing at most once (first
    /// occurrence wins).
    pub fn mutate_batch(&self, commands: &[String]) -> Vec<String> {
        let mut all_mutations = Vec::new();
        let mut seen = HashSet::new();

        for cmd in commands {
            let candidates = std::iter::once(cmd.clone()).chain(self.mutate(cmd));
            for candidate in candidates {
                if seen.insert(candidate.clone()) {
                    all_mutations.push(candidate);
                }
            }
        }

        all_mutations
    }
}

impl Default for CommandMutator {
    fn default() -> Self {
        Self::new()
    }
}
126
/// Coverage-guided generator that fills gaps in n-gram model.
///
/// Candidates are scored by how many distinct n-grams they contain that are
/// not in the known set.
pub struct CoverageGuidedGenerator {
    /// Known n-grams from training
    known_ngrams: HashSet<String>,
    /// Target n-gram size (a value of 0 is treated like 1)
    n: usize,
}

impl CoverageGuidedGenerator {
    /// Create from existing n-gram counts.
    ///
    /// `known_ngrams` holds space-joined token sequences; `n` is the maximum
    /// number of tokens per n-gram.
    pub fn new(known_ngrams: HashSet<String>, n: usize) -> Self {
        Self { known_ngrams, n }
    }

    /// Generate commands that exercise underrepresented n-grams.
    ///
    /// Scans `base_commands` in order, keeping those that contain at least
    /// one n-gram missing from the known set, and stops scanning once
    /// `2 * count` such commands were found. The kept commands are ranked by
    /// their number of distinct new n-grams (descending, ties keep input
    /// order) and the top `count` are returned.
    pub fn generate(&self, base_commands: &[String], count: usize) -> Vec<String> {
        if count == 0 {
            return Vec::new();
        }

        // Saturating so that an enormous `count` cannot overflow the cap.
        let scan_limit = count.saturating_mul(2);
        let mut ranked: Vec<(&String, usize)> = Vec::new();

        for cmd in base_commands {
            // A set, so a repeated n-gram inside one command counts once.
            let novel: HashSet<String> = self
                .extract_ngrams(cmd)
                .into_iter()
                .filter(|ng| !self.known_ngrams.contains(ng))
                .collect();

            if !novel.is_empty() {
                ranked.push((cmd, novel.len()));
            }

            if ranked.len() >= scan_limit {
                break;
            }
        }

        // Stable sort by coverage gain (descending).
        ranked.sort_by(|a, b| b.1.cmp(&a.1));

        // Only the winners are cloned.
        ranked
            .into_iter()
            .take(count)
            .map(|(cmd, _)| cmd.clone())
            .collect()
    }

    /// Returns, for every token position, the space-joined window of up to
    /// `n` tokens ending at that position. Early positions therefore yield
    /// shorter prefixes, starting with the first token on its own.
    fn extract_ngrams(&self, command: &str) -> Vec<String> {
        let tokens: Vec<&str> = command.split_whitespace().collect();
        // Number of preceding tokens included in each window. Saturating so
        // that `n == 0` degrades to unigrams instead of underflowing.
        let lookback = self.n.saturating_sub(1);

        (0..tokens.len())
            .map(|end| tokens[end.saturating_sub(lookback)..=end].join(" "))
            .collect()
    }

    /// Report coverage stats for `commands` relative to the known n-grams.
    ///
    /// All counts are over distinct n-grams.
    pub fn coverage_report(&self, commands: &[String]) -> CoverageReport {
        let mut total_ngrams = HashSet::new();
        let mut new_ngrams = HashSet::new();

        for cmd in commands {
            for ng in self.extract_ngrams(cmd) {
                if !self.known_ngrams.contains(&ng) {
                    new_ngrams.insert(ng.clone());
                }
                total_ngrams.insert(ng);
            }
        }

        let coverage_gain = if total_ngrams.is_empty() {
            0.0
        } else {
            new_ngrams.len() as f32 / total_ngrams.len() as f32
        };

        CoverageReport {
            known_ngrams: self.known_ngrams.len(),
            total_ngrams: total_ngrams.len(),
            new_ngrams: new_ngrams.len(),
            coverage_gain,
        }
    }
}

/// Coverage statistics
#[derive(Debug, Clone)]
pub struct CoverageReport {
    /// N-grams already in model
    pub known_ngrams: usize,
    /// Distinct n-grams in synthetic data
    pub total_ngrams: usize,
    /// Distinct n-grams from synthetic data that the model does not know
    pub new_ngrams: usize,
    /// Fraction (0.0 to 1.0) of the synthetic data's distinct n-grams that
    /// are new; 0.0 when there are no n-grams at all
    pub coverage_gain: f32,
}
237
238/// Combined synthetic data pipeline
239pub struct SyntheticPipeline {
240    generator: CommandGenerator,
241    mutator: CommandMutator,
242}
243
244impl SyntheticPipeline {
245    /// Create new pipeline
246    #[must_use]
247    pub fn new() -> Self {
248        Self {
249            generator: CommandGenerator::new(),
250            mutator: CommandMutator::new(),
251        }
252    }
253
254    /// Generate synthetic training data
255    ///
256    /// 1. Generate base commands from templates
257    /// 2. Mutate real history for variations
258    /// 3. Use coverage-guided selection
259    pub fn generate(
260        &self,
261        real_history: &[String],
262        known_ngrams: HashSet<String>,
263        count: usize,
264    ) -> SyntheticResult {
265        // Step 1: Generate template commands
266        let template_commands = self.generator.generate(count * 2);
267
268        // Step 2: Mutate real history
269        let mutated_commands = self.mutator.mutate_batch(real_history);
270
271        // Step 3: Combine all candidates
272        let mut all_candidates: Vec<String> = template_commands;
273        all_candidates.extend(mutated_commands);
274
275        // Step 4: Coverage-guided selection
276        let coverage_gen = CoverageGuidedGenerator::new(known_ngrams.clone(), 3);
277        let selected = coverage_gen.generate(&all_candidates, count);
278
279        // Step 5: Generate report
280        let report = coverage_gen.coverage_report(&selected);
281
282        SyntheticResult {
283            commands: selected,
284            report,
285        }
286    }
287}
288
289impl Default for SyntheticPipeline {
290    fn default() -> Self {
291        Self::new()
292    }
293}
294
295/// Result of synthetic data generation
296#[derive(Debug)]
297pub struct SyntheticResult {
298    /// Generated commands
299    pub commands: Vec<String>,
300    /// Coverage report
301    pub report: CoverageReport,
302}
303
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned string set from string literals.
    fn string_set(items: &[&str]) -> HashSet<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// The template generator yields a sizeable batch covering git and cargo.
    #[test]
    fn test_command_generator_creates_commands() {
        let commands = CommandGenerator::new().generate(1000);
        assert!(commands.len() >= 500);
        for prefix in &["git", "cargo"] {
            assert!(commands.iter().any(|c| c.starts_with(*prefix)));
        }
    }

    /// Generated template commands are pairwise distinct.
    #[test]
    fn test_command_generator_no_duplicates() {
        let commands = CommandGenerator::new().generate(1000);
        let distinct: HashSet<_> = commands.iter().collect();
        assert_eq!(commands.len(), distinct.len());
    }

    /// Mutating a commit command produces subcommand variations.
    #[test]
    fn test_mutator_creates_variations() {
        let variants = CommandMutator::new().mutate("git commit -m test");
        assert!(!variants.is_empty());
        let has_subcommand_swap = variants
            .iter()
            .any(|v| v.contains("add") || v.contains("status"));
        assert!(has_subcommand_swap);
    }

    /// `--release` is either dropped or replaced in at least one mutation.
    #[test]
    fn test_mutator_flag_substitution() {
        let variants = CommandMutator::new().mutate("cargo build --release");
        let flag_was_mutated = variants
            .iter()
            .any(|v| !v.contains("--release") || v.contains("--debug"));
        assert!(flag_was_mutated);
    }

    /// A candidate made only of known n-grams loses to one with new n-grams.
    #[test]
    fn test_coverage_guided_prioritizes_new_ngrams() {
        let selector = CoverageGuidedGenerator::new(string_set(&["git", "git commit"]), 3);

        let candidates = vec![
            String::from("git commit"), // Known
            String::from("cargo test"), // New
        ];

        let picked = selector.generate(&candidates, 1);
        assert_eq!(picked.len(), 1);
        assert!(picked[0].contains("cargo")); // Should prefer new
    }

    /// With nothing known, the pipeline returns commands that add n-grams.
    #[test]
    fn test_pipeline_generates_diverse_data() {
        let history = vec![String::from("git status"), String::from("cargo test")];

        let outcome = SyntheticPipeline::new().generate(&history, HashSet::new(), 50);
        assert!(!outcome.commands.is_empty());
        assert!(outcome.report.new_ngrams > 0);
    }

    /// The report reflects the known-set size and detects new n-grams.
    #[test]
    fn test_coverage_report_accuracy() {
        let selector = CoverageGuidedGenerator::new(string_set(&["git"]), 2);

        let commands = vec![String::from("git status"), String::from("cargo test")];
        let stats = selector.coverage_report(&commands);

        assert_eq!(stats.known_ngrams, 1);
        assert!(stats.new_ngrams > 0);
        assert!(stats.coverage_gain > 0.0);
    }
}