1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/// Data quality pipeline subcommands (powered by alimentar).
///
/// Thin CLI wrappers around alimentar's data utilities.
//
// Maintainer notes:
// - clap turns the `///` comments on variants and fields into `--help` text,
//   so they are user-facing strings; keep internal remarks in plain `//`
//   comments like this one.
// - Numeric defaults use `default_value_t` so the literal is type-checked
//   against the field at compile time instead of being parsed from a string
//   at runtime. The rendered default in `--help` is unchanged.
#[derive(Subcommand, Debug)]
pub enum DataCommands {
    /// Audit a JSONL classification dataset for quality issues
    Audit {
        /// Path to JSONL data file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Number of output classes (for label range validation)
        #[arg(long, default_value_t = 5)]
        num_classes: usize,
        /// Input text column name
        #[arg(long, default_value = "input")]
        input_column: String,
        /// Label column name
        #[arg(long, default_value = "label")]
        label_column: String,
        /// Preamble prefix to detect (e.g., "#!/")
        // NOTE(review): with a default supplied, clap should always populate
        // this field, so it is presumably never `None` — confirm whether the
        // `Option` is intentional or whether the default should be dropped.
        #[arg(long, default_value = "#!/")]
        preamble_prefix: Option<String>,
    },
    /// Stratified train/val/test split preserving class proportions
    Split {
        /// Path to JSONL data file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        // The three default fractions below (0.8 / 0.1 / 0.1) sum to 1.0.
        // Nothing in this declaration enforces that for user-supplied values.
        /// Training set fraction
        #[arg(long, default_value_t = 0.8)]
        train: f64,
        /// Validation set fraction
        #[arg(long, default_value_t = 0.1)]
        val: f64,
        /// Test set fraction
        #[arg(long, default_value_t = 0.1)]
        test: f64,
        /// Label column name for stratification
        #[arg(long, default_value = "label")]
        label_column: String,
        /// Random seed for deterministic split
        #[arg(long, default_value_t = 42)]
        seed: u64,
        /// Output directory for split files
        #[arg(short, long)]
        output: PathBuf,
    },
    /// Check training data for benchmark contamination via n-gram overlap
    Decontaminate {
        /// Path to training JSONL data file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Reference benchmark JSONL files to check against
        // `required = true` together with `num_args = 1..` means at least one
        // reference path must be given.
        #[arg(long, required = true, num_args = 1..)]
        reference: Vec<PathBuf>,
        /// N-gram size for overlap detection
        #[arg(long, default_value_t = 10)]
        ngram: usize,
        /// Overlap threshold (0.0-1.0) above which a sample is flagged
        // The 0.0-1.0 range is documented only; no value parser enforces it here.
        #[arg(long, default_value_t = 0.5)]
        threshold: f64,
        /// Output as JSON
        #[arg(long)]
        json: bool,
    },
    /// Resample dataset to address class imbalance
    Balance {
        /// Path to JSONL data file
        #[arg(value_name = "FILE")]
        file: PathBuf,
        /// Rebalancing strategy: oversample, undersample, sqrt-inverse
        // NOTE(review): accepted as a free-form string; the set of valid
        // strategies is not constrained by this declaration.
        #[arg(long, default_value = "oversample")]
        strategy: String,
        /// Label column name
        #[arg(long, default_value = "label")]
        label_column: String,
        /// Number of classes (for sqrt-inverse weight computation)
        #[arg(long)]
        num_classes: Option<usize>,
        /// Random seed
        #[arg(long, default_value_t = 42)]
        seed: u64,
        /// Output file path (required for oversample/undersample)
        #[arg(short, long)]
        output: Option<PathBuf>,
    },
}