apr_cli/data_commands.rs
1
2/// Data quality pipeline subcommands (powered by alimentar).
3///
4/// Thin CLI wrappers around alimentar's data utilities.
5#[derive(Subcommand, Debug)]
6pub enum DataCommands {
7 /// Audit a JSONL classification dataset for quality issues
8 Audit {
9 /// Path to JSONL data file
10 #[arg(value_name = "FILE")]
11 file: PathBuf,
12 /// Number of output classes (for label range validation)
13 #[arg(long, default_value = "5")]
14 num_classes: usize,
15 /// Input text column name
16 #[arg(long, default_value = "input")]
17 input_column: String,
18 /// Label column name
19 #[arg(long, default_value = "label")]
20 label_column: String,
21 /// Preamble prefix to detect (e.g., "#!/")
22 #[arg(long, default_value = "#!/")]
23 preamble_prefix: Option<String>,
24 },
25 /// Stratified train/val/test split preserving class proportions
26 Split {
27 /// Path to JSONL data file
28 #[arg(value_name = "FILE")]
29 file: PathBuf,
30 /// Training set fraction
31 #[arg(long, default_value = "0.8")]
32 train: f64,
33 /// Validation set fraction
34 #[arg(long, default_value = "0.1")]
35 val: f64,
36 /// Test set fraction
37 #[arg(long, default_value = "0.1")]
38 test: f64,
39 /// Label column name for stratification
40 #[arg(long, default_value = "label")]
41 label_column: String,
42 /// Random seed for deterministic split
43 #[arg(long, default_value = "42")]
44 seed: u64,
45 /// Output directory for split files
46 #[arg(short, long)]
47 output: PathBuf,
48 },
49 /// Check training data for benchmark contamination via n-gram overlap
50 Decontaminate {
51 /// Path to training JSONL data file
52 #[arg(value_name = "FILE")]
53 file: PathBuf,
54 /// Reference benchmark JSONL files to check against
55 #[arg(long, required = true, num_args = 1..)]
56 reference: Vec<PathBuf>,
57 /// N-gram size for overlap detection
58 #[arg(long, default_value = "10")]
59 ngram: usize,
60 /// Overlap threshold (0.0-1.0) above which a sample is flagged
61 #[arg(long, default_value = "0.5")]
62 threshold: f64,
63 /// Output as JSON
64 #[arg(long)]
65 json: bool,
66 },
67 /// Resample dataset to address class imbalance
68 Balance {
69 /// Path to JSONL data file
70 #[arg(value_name = "FILE")]
71 file: PathBuf,
72 /// Rebalancing strategy: oversample, undersample, sqrt-inverse
73 #[arg(long, default_value = "oversample")]
74 strategy: String,
75 /// Label column name
76 #[arg(long, default_value = "label")]
77 label_column: String,
78 /// Number of classes (for sqrt-inverse weight computation)
79 #[arg(long)]
80 num_classes: Option<usize>,
81 /// Random seed
82 #[arg(long, default_value = "42")]
83 seed: u64,
84 /// Output file path (required for oversample/undersample)
85 #[arg(short, long)]
86 output: Option<PathBuf>,
87 },
88}