Skip to main content

apr_cli/
data_commands.rs

1
2/// Data quality pipeline subcommands (powered by alimentar).
3///
4/// Thin CLI wrappers around alimentar's data utilities.
5#[derive(Subcommand, Debug)]
6pub enum DataCommands {
7    /// Audit a JSONL classification dataset for quality issues
8    Audit {
9        /// Path to JSONL data file
10        #[arg(value_name = "FILE")]
11        file: PathBuf,
12        /// Number of output classes (for label range validation)
13        #[arg(long, default_value = "5")]
14        num_classes: usize,
15        /// Input text column name
16        #[arg(long, default_value = "input")]
17        input_column: String,
18        /// Label column name
19        #[arg(long, default_value = "label")]
20        label_column: String,
21        /// Preamble prefix to detect (e.g., "#!/")
22        #[arg(long, default_value = "#!/")]
23        preamble_prefix: Option<String>,
24    },
25    /// Stratified train/val/test split preserving class proportions
26    Split {
27        /// Path to JSONL data file
28        #[arg(value_name = "FILE")]
29        file: PathBuf,
30        /// Training set fraction
31        #[arg(long, default_value = "0.8")]
32        train: f64,
33        /// Validation set fraction
34        #[arg(long, default_value = "0.1")]
35        val: f64,
36        /// Test set fraction
37        #[arg(long, default_value = "0.1")]
38        test: f64,
39        /// Label column name for stratification
40        #[arg(long, default_value = "label")]
41        label_column: String,
42        /// Random seed for deterministic split
43        #[arg(long, default_value = "42")]
44        seed: u64,
45        /// Output directory for split files
46        #[arg(short, long)]
47        output: PathBuf,
48    },
49    /// Check training data for benchmark contamination via n-gram overlap
50    Decontaminate {
51        /// Path to training JSONL data file
52        #[arg(value_name = "FILE")]
53        file: PathBuf,
54        /// Reference benchmark JSONL files to check against
55        #[arg(long, required = true, num_args = 1..)]
56        reference: Vec<PathBuf>,
57        /// N-gram size for overlap detection
58        #[arg(long, default_value = "10")]
59        ngram: usize,
60        /// Overlap threshold (0.0-1.0) above which a sample is flagged
61        #[arg(long, default_value = "0.5")]
62        threshold: f64,
63        /// Output as JSON
64        #[arg(long)]
65        json: bool,
66    },
67    /// Resample dataset to address class imbalance
68    Balance {
69        /// Path to JSONL data file
70        #[arg(value_name = "FILE")]
71        file: PathBuf,
72        /// Rebalancing strategy: oversample, undersample, sqrt-inverse
73        #[arg(long, default_value = "oversample")]
74        strategy: String,
75        /// Label column name
76        #[arg(long, default_value = "label")]
77        label_column: String,
78        /// Number of classes (for sqrt-inverse weight computation)
79        #[arg(long)]
80        num_classes: Option<usize>,
81        /// Random seed
82        #[arg(long, default_value = "42")]
83        seed: u64,
84        /// Output file path (required for oversample/undersample)
85        #[arg(short, long)]
86        output: Option<PathBuf>,
87    },
88}