apr_cli/tokenize_commands.rs
use clap::Subcommand;
use std::path::PathBuf;

/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
///
/// Thin CLI wrappers around aprender's BPE training infrastructure.
/// Trains a BPE vocabulary from a text corpus for use in model training.
#[derive(Subcommand, Debug)]
pub enum TokenizeCommands {
    /// Validate inputs and estimate tokenizer training time/resources.
    ///
    /// Checks that the input corpus exists, counts lines/bytes, estimates
    /// vocabulary coverage, and reports expected training time. Outputs a
    /// serializable plan manifest (text, JSON, or YAML).
    ///
    /// Analogous to `forjar plan` — shows what will happen before committing.
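    ///
    /// Hypothetical invocation (flag names follow the fields declared below;
    /// the `apr tokenize` command path matches the one referenced in the
    /// `EncodeCorpus` docs):
    ///
    /// ```text
    /// apr tokenize plan --data corpus.txt --vocab-size 32000 --format json
    /// ```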
    Plan {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Output format: text, json, yaml
        #[arg(long, default_value = "text")]
        format: String,
    },

    /// Train a tokenizer on the corpus.
    ///
    /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
    /// and writes vocab.json + merges.txt to the output directory.
    ///
    /// Analogous to `forjar apply` — commits resources and executes the plan.
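    ///
    /// Hypothetical invocation (flag names follow the fields declared below;
    /// `--max-lines 0`, the default, reads the entire corpus):
    ///
    /// ```text
    /// apr tokenize apply --data corpus.txt --output ./tok --max-lines 200000
    /// ```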
    Apply {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Maximum number of lines to read from corpus (0 = all)
        #[arg(long, default_value = "0")]
        max_lines: usize,
    },

    /// Train BPE on a JSONL corpus per contracts/tokenizer-bpe-v1.yaml (MODEL-2).
    ///
    /// Walks `--corpus` (a file or a directory of `.jsonl` files), extracts
    /// the `content` field from each line, applies `--normalization` (NFC by
    /// default), and trains a BPE tokenizer with the target vocab size.
    /// Writes `vocab.json` (token→id) and `merges.txt` (one `a b` pair per
    /// line, in merge order) to `--output`.
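    ///
    /// merges.txt sketch (pair contents illustrative; layout per the
    /// description above — one space-separated pair per line, in merge order):
    ///
    /// ```text
    /// t h
    /// th e
    /// ```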
    Train {
        /// Path to corpus: a `.jsonl` file or a directory containing `.jsonl` files.
        /// Each line must be a JSON object with a `content` field.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "50000")]
        vocab_size: usize,
        /// Minimum frequency a byte pair must reach before BPE merges it into
        /// a new vocabulary token (honored by `entrenar::tokenizer::BPETokenizer`
        /// per task #103). Pairs below this threshold are left unmerged —
        /// contract INV-TOK-002 of `contracts/tokenizer-bpe-v1.yaml`.
        #[arg(long, default_value = "2")]
        min_frequency: usize,
        /// Output directory; will contain vocab.json and merges.txt.
        #[arg(long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Unicode normalization form applied to each document before training.
        #[arg(long, default_value = "nfc")]
        normalization: String,
    },

    /// Encode a JSONL corpus into `.bin` shards per contracts/pretokenize-bin-v1.yaml.
    ///
    /// Loads a trained BPE tokenizer (vocab.json + merges.txt) from `--tokenizer`,
    /// reads `--corpus` (file or directory of `.jsonl` files), encodes the
    /// `--content-field` of each line to u32 tokens, and writes
    /// `shard-NNNN.bin` files (flat little-endian u32 streams) into `--output`.
    /// The output format is precisely what `ShardBatchIter` (aprender-train)
    /// expects at MODEL-2 pretrain read time.
    ///
    /// Root-cause fix for the pretokenize-to-bin gap documented in
    /// memory/project_shard_reader_bin_format.md — replaces a Python shim
    /// that was flagged as MUDA on 2026-04-19.
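    ///
    /// Expected output layout (sketch; shard count depends on corpus size
    /// and `--shard-tokens`):
    ///
    /// ```text
    /// output/
    /// ├── shard-0000.bin
    /// ├── shard-0001.bin
    /// └── manifest.json
    /// ```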
    EncodeCorpus {
        /// Path to JSONL corpus file or directory of `.jsonl` files.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Directory containing vocab.json + merges.txt from `apr tokenize train`.
        #[arg(long, value_name = "DIR")]
        tokenizer: PathBuf,
        /// Output directory for shard-NNNN.bin + manifest.json.
        #[arg(long, value_name = "DIR")]
        output: PathBuf,
        /// Target tokens per shard (shard closes once this limit is reached).
        #[arg(long, default_value = "10000000")]
        shard_tokens: usize,
        /// JSONL field to encode (default: `content`).
        #[arg(long, default_value = "content")]
        content_field: String,
        /// Unicode normalization (must match tokenizer training).
        #[arg(long, default_value = "nfc")]
        normalization: String,
        /// EOS insertion policy: none|between|after.
        #[arg(long, default_value = "between")]
        eos_policy: String,
    },
}
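
// Illustration only, not part of the original module: a minimal writer for
// the flat little-endian u32 `.bin` layout that `EncodeCorpus` documents
// above. The function name and buffering strategy are assumptions; the real
// encoder lives elsewhere in apr_cli.
#[allow(dead_code)]
fn write_shard_sketch(path: &std::path::Path, tokens: &[u32]) -> std::io::Result<()> {
    use std::io::{BufWriter, Write};
    let mut w = BufWriter::new(std::fs::File::create(path)?);
    for &t in tokens {
        // `ShardBatchIter` reads these bytes back as consecutive LE u32 ids.
        w.write_all(&t.to_le_bytes())?;
    }
    w.flush()
}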