apr_cli/tokenize_commands.rs
1
2/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
3///
4/// Thin CLI wrappers around aprender's BPE training infrastructure.
5/// Trains a BPE vocabulary from a text corpus for use in model training.
6#[derive(Subcommand, Debug)]
7pub enum TokenizeCommands {
8 /// Validate inputs and estimate tokenizer training time/resources.
9 ///
10 /// Checks that the input corpus exists, counts lines/bytes, estimates
11 /// vocabulary coverage, and reports expected training time. Outputs a
12 /// serializable plan manifest (text, JSON, or YAML).
13 ///
14 /// Analogous to `forjar plan` — shows what will happen before committing.
15 Plan {
16 /// Path to training corpus (text file, one document per line)
17 #[arg(long, value_name = "FILE")]
18 data: PathBuf,
19 /// Target vocabulary size
20 #[arg(long, default_value = "32000")]
21 vocab_size: usize,
22 /// Tokenizer algorithm: bpe, wordpiece, unigram
23 #[arg(long, default_value = "bpe")]
24 algorithm: String,
25 /// Output directory for trained tokenizer
26 #[arg(short, long, default_value = "./tokenizer-output")]
27 output: PathBuf,
28 /// Output format: text, json, yaml
29 #[arg(long, default_value = "text")]
30 format: String,
31 },
32
33 /// Train a tokenizer on the corpus.
34 ///
35 /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
36 /// and writes vocab.json + merges.txt to the output directory.
37 ///
38 /// Analogous to `forjar apply` — commits resources and executes the plan.
39 Apply {
40 /// Path to training corpus (text file, one document per line)
41 #[arg(long, value_name = "FILE")]
42 data: PathBuf,
43 /// Target vocabulary size
44 #[arg(long, default_value = "32000")]
45 vocab_size: usize,
46 /// Tokenizer algorithm: bpe, wordpiece, unigram
47 #[arg(long, default_value = "bpe")]
48 algorithm: String,
49 /// Output directory for trained tokenizer
50 #[arg(short, long, default_value = "./tokenizer-output")]
51 output: PathBuf,
52 /// Maximum number of lines to read from corpus (0 = all)
53 #[arg(long, default_value = "0")]
54 max_lines: usize,
55 },
56}