Skip to main content

aprender/synthetic/
shell.rs

//! Shell Autocomplete Synthetic Data Generator.
//!
//! Generates synthetic training data for shell command autocomplete SLMs.
//! Uses template substitution, argument permutation, and context variation
//! to augment limited shell command corpora.
//!
//! # References
//!
//! - Jia & Liang (2016). Data Recombination for Neural Semantic Parsing. ACL.
//! - Section 4 of `AutoML` with Synthetic Data Specification.
11
12use std::collections::{HashMap, HashSet};
13
14use super::{SyntheticConfig, SyntheticGenerator};
15use crate::error::Result;
16
17// ============================================================================
18// ShellSample: Command with context
19// ============================================================================
20
/// One training example for shell autocomplete.
///
/// Pairs a partially typed command (`prefix`) with the full command it
/// should expand to (`completion`), and optionally carries session context:
/// the commands entered earlier and the working directory.
///
/// # Example
///
/// ```
/// use aprender::synthetic::shell::ShellSample;
///
/// let sample = ShellSample::new("git st", "git status")
///     .with_history(vec!["cd project".to_string()])
///     .with_cwd("/home/user/project");
///
/// assert_eq!(sample.prefix(), "git st");
/// assert_eq!(sample.completion(), "git status");
/// ```
#[derive(Debug, Clone, PartialEq)]
pub struct ShellSample {
    /// Earlier commands from the same session (context).
    history: Vec<String>,
    /// Partial input that is to be completed.
    prefix: String,
    /// Full command the prefix should expand to (label).
    completion: String,
    /// Working directory context.
    cwd: String,
}

impl ShellSample {
    /// Build a sample from a prefix and the command it completes to.
    ///
    /// The history starts out empty and the working directory starts out as
    /// an empty string; use [`Self::with_history`] and [`Self::with_cwd`] to
    /// fill them in.
    ///
    /// # Arguments
    ///
    /// * `prefix` - Current partial input
    /// * `completion` - Full completed command
    #[must_use]
    pub fn new(prefix: impl Into<String>, completion: impl Into<String>) -> Self {
        let prefix = prefix.into();
        let completion = completion.into();
        Self {
            history: Vec::new(),
            prefix,
            completion,
            cwd: String::new(),
        }
    }

    /// Replace the command history, returning the updated sample.
    #[must_use]
    pub fn with_history(self, history: Vec<String>) -> Self {
        Self { history, ..self }
    }

    /// Replace the working directory, returning the updated sample.
    #[must_use]
    pub fn with_cwd(self, cwd: impl Into<String>) -> Self {
        Self {
            cwd: cwd.into(),
            ..self
        }
    }

    /// Partial input to be completed.
    #[must_use]
    pub fn prefix(&self) -> &str {
        self.prefix.as_str()
    }

    /// Full completed command.
    #[must_use]
    pub fn completion(&self) -> &str {
        self.completion.as_str()
    }

    /// Commands entered before this one.
    #[must_use]
    pub fn history(&self) -> &[String] {
        self.history.as_slice()
    }

    /// Working directory context.
    #[must_use]
    pub fn cwd(&self) -> &str {
        self.cwd.as_str()
    }

    /// First whitespace-separated token of the completion, or `None` when
    /// the completion is empty or whitespace only.
    #[must_use]
    pub fn command_name(&self) -> Option<&str> {
        let mut tokens = self.completion.split_whitespace();
        tokens.next()
    }

    /// Every whitespace-separated token of the completion after the first.
    #[must_use]
    pub fn arguments(&self) -> Vec<&str> {
        let mut tokens = self.completion.split_whitespace();
        // The leading token is the command name, not an argument.
        let _ = tokens.next();
        tokens.collect()
    }

    /// Whether the completion begins with the prefix (plain string
    /// comparison, no tokenisation).
    #[must_use]
    pub fn is_valid_completion(&self) -> bool {
        self.completion.strip_prefix(self.prefix.as_str()).is_some()
    }
}
123
124// ============================================================================
125// ShellGrammar: Command validation
126// ============================================================================
127
/// Lookup tables used to sanity-check shell command strings.
///
/// Holds a set of known command names, per-command sets of subcommands,
/// and a set of common options. Validation is purely token based; see
/// [`ShellGrammar::is_valid_command`] and [`ShellGrammar::is_valid_option`]
/// for the exact rules.
///
/// # Example
///
/// ```
/// use aprender::synthetic::shell::ShellGrammar;
///
/// let grammar = ShellGrammar::common_commands();
///
/// assert!(grammar.is_valid_command("git status"));
/// assert!(grammar.is_valid_command("cargo build --release"));
/// assert!(!grammar.is_valid_command(""));
/// ```
#[derive(Debug, Clone)]
pub struct ShellGrammar {
    /// Known command names.
    commands: HashSet<String>,
    /// Subcommands for each command: command -> [subcommands].
    subcommands: HashMap<String, HashSet<String>>,
    /// Common options that apply to many commands.
    common_options: HashSet<String>,
}

impl ShellGrammar {
    /// Create a grammar that knows no commands, subcommands, or options.
    #[must_use]
    pub fn new() -> Self {
        Self {
            commands: HashSet::default(),
            subcommands: HashMap::default(),
            common_options: HashSet::default(),
        }
    }

    /// Create a grammar pre-populated with a built-in set of commands:
    /// `git`, `cargo`, `npm`, and `docker` (each with subcommands), a list
    /// of plain Unix commands, and a handful of common options.
    #[must_use]
    pub fn common_commands() -> Self {
        // Tools that are registered together with their subcommands.
        const TOOLS: &[(&str, &[&str])] = &[
            (
                "git",
                &[
                    "status", "commit", "push", "pull", "checkout", "branch", "merge", "rebase",
                    "log", "diff", "add", "reset", "stash", "fetch", "clone", "init",
                ],
            ),
            (
                "cargo",
                &[
                    "build", "run", "test", "check", "clippy", "fmt", "doc", "publish", "new",
                    "init", "add", "remove", "update", "bench", "clean",
                ],
            ),
            (
                "npm",
                &[
                    "install", "run", "test", "start", "build", "publish", "init", "update",
                    "audit", "ci", "pack",
                ],
            ),
            (
                "docker",
                &[
                    "run", "build", "push", "pull", "ps", "images", "exec", "stop", "start", "rm",
                    "rmi", "logs", "compose",
                ],
            ),
        ];

        // Commands registered by name only, with no subcommand table.
        const UNIX_COMMANDS: &[&str] = &[
            "ls", "cd", "cp", "mv", "rm", "mkdir", "rmdir", "cat", "grep", "find", "chmod",
            "chown", "curl", "wget", "ssh", "scp", "tar", "zip", "unzip", "make", "python", "node",
        ];

        const COMMON_OPTIONS: &[&str] = &[
            "-h",
            "--help",
            "-v",
            "--version",
            "-q",
            "--quiet",
            "-f",
            "--force",
            "-r",
            "--recursive",
            "-n",
            "--dry-run",
        ];

        let mut grammar = Self::new();

        for &(tool, subs) in TOOLS {
            grammar.add_command(tool);
            grammar.add_subcommands(tool, subs);
        }
        for &name in UNIX_COMMANDS {
            grammar.add_command(name);
        }
        grammar.add_common_options(COMMON_OPTIONS);

        grammar
    }

    /// Register `command` as a known command name.
    pub fn add_command(&mut self, command: &str) {
        self.commands.insert(command.to_owned());
    }

    /// Register `subs` as subcommands of `command`.
    ///
    /// Only the subcommand table is touched; the command name itself is not
    /// added to the known-command set. An entry for `command` is created
    /// even when `subs` is empty.
    pub fn add_subcommands(&mut self, command: &str, subs: &[&str]) {
        self.subcommands
            .entry(command.to_owned())
            .or_default()
            .extend(subs.iter().map(|sub| (*sub).to_owned()));
    }

    /// Register additional common options.
    pub fn add_common_options(&mut self, options: &[&str]) {
        self.common_options
            .extend(options.iter().map(|opt| (*opt).to_owned()));
    }

    /// Check if a command string is valid.
    ///
    /// The string is split on whitespace and accepted when:
    /// - it contains at least one token,
    /// - the first token exactly matches a known command, and
    /// - if that command has a subcommand table and a second token is
    ///   present, the second token is a listed subcommand or starts with
    ///   `-` (i.e. looks like an option).
    ///
    /// Tokens after the second are never inspected.
    #[must_use]
    pub fn is_valid_command(&self, command: &str) -> bool {
        let mut tokens = command.split_whitespace();

        let name = match tokens.next() {
            Some(name) => name,
            None => return false,
        };
        if !self.commands.contains(name) {
            return false;
        }

        match (self.subcommands.get(name), tokens.next()) {
            (Some(known), Some(second)) => known.contains(second) || second.starts_with('-'),
            // No subcommand table, or nothing after the command name.
            _ => true,
        }
    }

    /// Check if a token is acceptable as an option.
    ///
    /// The token must start with `-`. Beyond that it is accepted when it is
    /// at most 20 bytes long, or when it is one of the registered common
    /// options (which therefore only matters for longer tokens).
    #[must_use]
    pub fn is_valid_option(&self, option: &str) -> bool {
        // Tokens up to this many bytes pass without being registered.
        const MAX_UNLISTED_LEN: usize = 20;

        if !option.starts_with('-') {
            return false;
        }
        option.len() <= MAX_UNLISTED_LEN || self.common_options.contains(option)
    }

    /// Borrow the set of known command names.
    #[must_use]
    pub fn commands(&self) -> &HashSet<String> {
        &self.commands
    }

    /// Look up the subcommand set registered for `command`, if any.
    #[must_use]
    pub fn get_subcommands(&self, command: &str) -> Option<&HashSet<String>> {
        self.subcommands.get(command)
    }
}

/// The default grammar is the pre-populated one from
/// [`ShellGrammar::common_commands`], not the empty one from
/// [`ShellGrammar::new`].
impl Default for ShellGrammar {
    fn default() -> Self {
        Self::common_commands()
    }
}
315
316// ============================================================================
317// ShellSyntheticGenerator: Generates synthetic shell samples
318// ============================================================================
319
/// Switches and limits controlling shell synthetic generation.
#[derive(Debug, Clone)]
pub struct ShellGeneratorConfig {
    /// Enable template-based generation.
    pub enable_template: bool,
    /// Enable argument permutation.
    pub enable_permutation: bool,
    /// Enable context variation.
    pub enable_context_variation: bool,
    /// Maximum arguments to permute.
    pub max_permute_args: usize,
}

/// Defaults: all three strategies enabled, permutation capped at 3 arguments.
impl Default for ShellGeneratorConfig {
    fn default() -> Self {
        // Every strategy is switched on unless the caller opts out.
        let enabled = true;
        Self {
            max_permute_args: 3,
            enable_context_variation: enabled,
            enable_permutation: enabled,
            enable_template: enabled,
        }
    }
}
343
344/// Synthetic data generator for shell autocomplete.
345///
346/// Implements three generation strategies:
347/// 1. Template substitution: Replace arguments with variants
348/// 2. Argument permutation: Reorder and add/remove options
349/// 3. Context variation: Modify history and cwd
350///
351/// # Example
352///
353/// ```
354/// use aprender::synthetic::shell::{ShellSyntheticGenerator, ShellSample};
355/// use aprender::synthetic::{SyntheticGenerator, SyntheticConfig};
356///
357/// let gen = ShellSyntheticGenerator::new();
358/// let seeds = vec![
359///     ShellSample::new("git st", "git status"),
360///     ShellSample::new("cargo b", "cargo build"),
361/// ];
362/// let config = SyntheticConfig::default().with_augmentation_ratio(1.0);
363///
364/// let synthetic = gen.generate(&seeds, &config).expect("generation failed");
365/// assert!(!synthetic.is_empty());
366/// ```
367#[derive(Debug, Clone)]
368pub struct ShellSyntheticGenerator {
369    /// Grammar for command validation.
370    grammar: ShellGrammar,
371    /// Generator configuration.
372    config: ShellGeneratorConfig,
373    /// Argument substitutions: arg -> [variants].
374    substitutions: HashMap<String, Vec<String>>,
375}
376
377include!("shell_generator_impl.rs");
378include!("shell_tests.rs");