// apr_cli/tokenize_commands.rs

/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
///
/// Thin CLI wrappers around aprender's BPE training infrastructure.
/// Trains a BPE vocabulary from a text corpus for use in model training.
//
// NOTE(review): the `///` doc comments below are not inert — clap-derive turns
// them into the runtime `--help` text, so editing them changes CLI output.
#[derive(Subcommand, Debug)]
pub enum TokenizeCommands {
    /// Validate inputs and estimate tokenizer training time/resources.
    ///
    /// Checks that the input corpus exists, counts lines/bytes, estimates
    /// vocabulary coverage, and reports expected training time. Outputs a
    /// serializable plan manifest (text, JSON, or YAML).
    ///
    /// Analogous to `forjar plan` — shows what will happen before committing.
    Plan {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        // NOTE(review): stringly-typed; a clap `ValueEnum` would reject invalid
        // values at parse time. Left as `String` to keep the public match
        // interface (callers destructure these fields) unchanged.
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Output format: text, json, yaml
        #[arg(long, default_value = "text")]
        format: String,
    },

    /// Train a tokenizer on the corpus.
    ///
    /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
    /// and writes vocab.json + merges.txt to the output directory.
    ///
    /// Analogous to `forjar apply` — commits resources and executes the plan.
    // NOTE(review): data/vocab_size/algorithm/output intentionally mirror
    // `Plan` so the two stages accept identical invocations; a shared
    // `#[command(flatten)]` args struct would deduplicate this but would
    // change the variant shape callers pattern-match on.
    Apply {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Maximum number of lines to read from corpus (0 = all)
        // `0 = all` sentinel rather than Option<usize>: keeps the flag's
        // default expressible as a literal in `default_value`.
        #[arg(long, default_value = "0")]
        max_lines: usize,
    },
}