aprender/synthetic/shell.rs
1//! Shell Autocomplete Synthetic Data Generator.
2//!
3//! Generates synthetic training data for shell command autocomplete SLMs.
4//! Uses template substitution, argument permutation, and context variation
5//! to augment limited shell command corpora.
6//!
7//! # References
8//!
9//! - Jia & Liang (2016). Data Recombination for Neural Semantic Parsing. ACL.
10//! - Section 4 of `AutoML` with Synthetic Data Specification.
11
12use std::collections::{HashMap, HashSet};
13
14use super::{SyntheticConfig, SyntheticGenerator};
15use crate::error::Result;
16
17// ============================================================================
18// ShellSample: Command with context
19// ============================================================================
20
/// A single training example for shell autocomplete.
///
/// Bundles the partial input typed so far (`prefix`), the full command it
/// should complete to (`completion`), and optional session context: the
/// commands run earlier and the working directory.
///
/// The type is `Eq + Hash`, so samples can be de-duplicated with a
/// `HashSet` or used as map keys.
///
/// # Example
///
/// ```
/// use aprender::synthetic::shell::ShellSample;
///
/// let sample = ShellSample::new("git st", "git status")
///     .with_history(vec!["cd project".to_string()])
///     .with_cwd("/home/user/project");
///
/// assert_eq!(sample.prefix(), "git st");
/// assert_eq!(sample.completion(), "git status");
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ShellSample {
    /// Commands issued earlier in the session (context).
    history: Vec<String>,
    /// Partial input that should be completed.
    prefix: String,
    /// Full command the prefix completes to (the label).
    completion: String,
    /// Working directory the command is issued from.
    cwd: String,
}

impl ShellSample {
    /// Builds a sample with empty history and an empty working directory.
    ///
    /// # Arguments
    ///
    /// * `prefix` - Current partial input
    /// * `completion` - Full completed command
    #[must_use]
    pub fn new(prefix: impl Into<String>, completion: impl Into<String>) -> Self {
        let prefix = prefix.into();
        let completion = completion.into();
        Self {
            history: Vec::new(),
            prefix,
            completion,
            cwd: String::new(),
        }
    }

    /// Replaces the command history and returns the updated sample.
    #[must_use]
    pub fn with_history(self, history: Vec<String>) -> Self {
        Self { history, ..self }
    }

    /// Replaces the working directory and returns the updated sample.
    #[must_use]
    pub fn with_cwd(self, cwd: impl Into<String>) -> Self {
        Self {
            cwd: cwd.into(),
            ..self
        }
    }

    /// Returns the partial input.
    #[must_use]
    pub fn prefix(&self) -> &str {
        self.prefix.as_str()
    }

    /// Returns the full completed command.
    #[must_use]
    pub fn completion(&self) -> &str {
        self.completion.as_str()
    }

    /// Returns the session history as a slice.
    #[must_use]
    pub fn history(&self) -> &[String] {
        self.history.as_slice()
    }

    /// Returns the working directory.
    #[must_use]
    pub fn cwd(&self) -> &str {
        self.cwd.as_str()
    }

    /// Returns the first whitespace-separated token of the completion.
    ///
    /// Yields `None` when the completion is empty or only whitespace.
    #[must_use]
    pub fn command_name(&self) -> Option<&str> {
        let mut tokens = self.completion.split_whitespace();
        tokens.next()
    }

    /// Returns every whitespace-separated token of the completion after the
    /// first one.
    #[must_use]
    pub fn arguments(&self) -> Vec<&str> {
        let mut tokens = self.completion.split_whitespace();
        // Drop the command name; whatever remains are the arguments.
        tokens.next();
        tokens.collect()
    }

    /// Reports whether the completion begins with the prefix.
    ///
    /// An empty prefix is a prefix of every completion, so it always passes.
    #[must_use]
    pub fn is_valid_completion(&self) -> bool {
        self.completion.starts_with(self.prefix.as_str())
    }
}
123
124// ============================================================================
125// ShellGrammar: Command validation
126// ============================================================================
127
/// Shell command grammar used to validate generated commands.
///
/// Holds a set of known command names, per-command subcommand sets, and a
/// set of common options, and answers membership-style questions about
/// command strings and option tokens.
///
/// # Example
///
/// ```
/// use aprender::synthetic::shell::ShellGrammar;
///
/// let grammar = ShellGrammar::common_commands();
///
/// assert!(grammar.is_valid_command("git status"));
/// assert!(grammar.is_valid_command("cargo build --release"));
/// assert!(!grammar.is_valid_command(""));
/// ```
#[derive(Debug, Clone)]
pub struct ShellGrammar {
    /// Known command names.
    commands: HashSet<String>,
    /// Subcommands for each command: command -> [subcommands].
    subcommands: HashMap<String, HashSet<String>>,
    /// Common options that apply to many commands.
    common_options: HashSet<String>,
}

impl ShellGrammar {
    /// Creates a grammar with no commands, subcommands, or options.
    #[must_use]
    pub fn new() -> Self {
        Self {
            commands: HashSet::new(),
            subcommands: HashMap::new(),
            common_options: HashSet::new(),
        }
    }

    /// Creates a grammar pre-populated with common shell tooling.
    ///
    /// Registers `git`, `cargo`, `npm`, and `docker` together with a set of
    /// subcommands each, a list of plain Unix commands without subcommand
    /// definitions, and a handful of widely used options.
    #[must_use]
    pub fn common_commands() -> Self {
        // Tools whose second token is checked against a subcommand list.
        const TOOLS_WITH_SUBCOMMANDS: &[(&str, &[&str])] = &[
            (
                "git",
                &[
                    "status", "commit", "push", "pull", "checkout", "branch", "merge", "rebase",
                    "log", "diff", "add", "reset", "stash", "fetch", "clone", "init",
                ],
            ),
            (
                "cargo",
                &[
                    "build", "run", "test", "check", "clippy", "fmt", "doc", "publish", "new",
                    "init", "add", "remove", "update", "bench", "clean",
                ],
            ),
            (
                "npm",
                &[
                    "install", "run", "test", "start", "build", "publish", "init", "update",
                    "audit", "ci", "pack",
                ],
            ),
            (
                "docker",
                &[
                    "run", "build", "push", "pull", "ps", "images", "exec", "stop", "start", "rm",
                    "rmi", "logs", "compose",
                ],
            ),
        ];

        // Commands registered without any subcommand definitions.
        const UNIX_COMMANDS: &[&str] = &[
            "ls", "cd", "cp", "mv", "rm", "mkdir", "rmdir", "cat", "grep", "find", "chmod",
            "chown", "curl", "wget", "ssh", "scp", "tar", "zip", "unzip", "make", "python", "node",
        ];

        const COMMON_OPTIONS: &[&str] = &[
            "-h",
            "--help",
            "-v",
            "--version",
            "-q",
            "--quiet",
            "-f",
            "--force",
            "-r",
            "--recursive",
            "-n",
            "--dry-run",
        ];

        let mut grammar = Self::new();

        for &(tool, subs) in TOOLS_WITH_SUBCOMMANDS {
            grammar.add_command(tool);
            grammar.add_subcommands(tool, subs);
        }
        for unix_cmd in UNIX_COMMANDS {
            grammar.add_command(unix_cmd);
        }
        grammar.add_common_options(COMMON_OPTIONS);

        grammar
    }

    /// Registers `command` as a known command name.
    pub fn add_command(&mut self, command: &str) {
        self.commands.insert(command.to_string());
    }

    /// Registers `subs` as subcommands of `command`.
    ///
    /// Subcommands accumulate across calls. This does not register `command`
    /// itself as a known command; call [`ShellGrammar::add_command`] for that.
    pub fn add_subcommands(&mut self, command: &str, subs: &[&str]) {
        self.subcommands
            .entry(command.to_string())
            .or_default()
            .extend(subs.iter().map(|sub| (*sub).to_string()));
    }

    /// Registers `options` as common options.
    pub fn add_common_options(&mut self, options: &[&str]) {
        self.common_options
            .extend(options.iter().map(|opt| (*opt).to_string()));
    }

    /// Checks whether a command string is valid under this grammar.
    ///
    /// A command is valid when all of the following hold:
    /// - It contains at least one whitespace-separated token.
    /// - The first token exactly matches a registered command name
    ///   (a token that merely starts with a known command is rejected).
    /// - If subcommands are registered for that command and a second token
    ///   is present, the second token is a registered subcommand or starts
    ///   with `-` (an option).
    ///
    /// Tokens after the second one are not inspected.
    #[must_use]
    pub fn is_valid_command(&self, command: &str) -> bool {
        // Only the first two tokens matter, so walk the iterator instead of
        // collecting every token.
        let mut tokens = command.split_whitespace();

        let cmd_name = match tokens.next() {
            Some(name) => name,
            None => return false,
        };
        if !self.commands.contains(cmd_name) {
            return false;
        }

        match (self.subcommands.get(cmd_name), tokens.next()) {
            (Some(known), Some(second)) => known.contains(second) || second.starts_with('-'),
            // No subcommand definitions, or nothing after the command name.
            _ => true,
        }
    }

    /// Checks whether a token is acceptable as an option.
    ///
    /// The token must start with `-`. Beyond that it is accepted if it is a
    /// registered common option or if it is at most 20 bytes long, so short
    /// unregistered options (including a bare `-` or `--`) also pass.
    #[must_use]
    pub fn is_valid_option(&self, option: &str) -> bool {
        // Longest dash-prefixed token accepted without being registered.
        const MAX_UNREGISTERED_LEN: usize = 20;

        if !option.starts_with('-') {
            return false;
        }
        option.len() <= MAX_UNREGISTERED_LEN || self.common_options.contains(option)
    }

    /// Returns the set of known command names.
    #[must_use]
    pub fn commands(&self) -> &HashSet<String> {
        &self.commands
    }

    /// Returns the subcommands registered for `command`, if any.
    #[must_use]
    pub fn get_subcommands(&self, command: &str) -> Option<&HashSet<String>> {
        self.subcommands.get(command)
    }
}

impl Default for ShellGrammar {
    /// Returns the grammar built by [`ShellGrammar::common_commands`], not
    /// the empty grammar from [`ShellGrammar::new`].
    fn default() -> Self {
        Self::common_commands()
    }
}
315
316// ============================================================================
317// ShellSyntheticGenerator: Generates synthetic shell samples
318// ============================================================================
319
/// Configuration for shell synthetic generation.
///
/// Each `enable_*` flag toggles one generation strategy. The `Default`
/// implementation turns all three on and sets `max_permute_args` to 3.
/// Configurations can be compared with `==`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ShellGeneratorConfig {
    /// Enable template-based generation.
    pub enable_template: bool,
    /// Enable argument permutation.
    pub enable_permutation: bool,
    /// Enable context variation.
    pub enable_context_variation: bool,
    /// Maximum arguments to permute.
    pub max_permute_args: usize,
}

impl Default for ShellGeneratorConfig {
    /// Returns a configuration with every strategy enabled and
    /// `max_permute_args` set to 3.
    fn default() -> Self {
        Self {
            enable_template: true,
            enable_permutation: true,
            enable_context_variation: true,
            max_permute_args: 3,
        }
    }
}
343
344/// Synthetic data generator for shell autocomplete.
345///
346/// Implements three generation strategies:
347/// 1. Template substitution: Replace arguments with variants
348/// 2. Argument permutation: Reorder and add/remove options
349/// 3. Context variation: Modify history and cwd
350///
351/// # Example
352///
353/// ```
354/// use aprender::synthetic::shell::{ShellSyntheticGenerator, ShellSample};
355/// use aprender::synthetic::{SyntheticGenerator, SyntheticConfig};
356///
357/// let gen = ShellSyntheticGenerator::new();
358/// let seeds = vec![
359/// ShellSample::new("git st", "git status"),
360/// ShellSample::new("cargo b", "cargo build"),
361/// ];
362/// let config = SyntheticConfig::default().with_augmentation_ratio(1.0);
363///
364/// let synthetic = gen.generate(&seeds, &config).expect("generation failed");
365/// assert!(!synthetic.is_empty());
366/// ```
367#[derive(Debug, Clone)]
368pub struct ShellSyntheticGenerator {
369 /// Grammar for command validation.
370 grammar: ShellGrammar,
371 /// Generator configuration.
372 config: ShellGeneratorConfig,
373 /// Argument substitutions: arg -> [variants].
374 substitutions: HashMap<String, Vec<String>>,
375}
376
377include!("shell_generator_impl.rs");
378include!("shell_tests.rs");