apr_cli/tokenize_commands.rs
use clap::Subcommand;
use std::path::PathBuf;

/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
///
/// Thin CLI wrappers around aprender's BPE training infrastructure.
/// Trains a BPE vocabulary from a text corpus for use in model training.
#[derive(Subcommand, Debug)]
pub enum TokenizeCommands {
    /// Validate inputs and estimate tokenizer training time/resources.
    ///
    /// Checks that the input corpus exists, counts lines/bytes, estimates
    /// vocabulary coverage, and reports expected training time. Outputs a
    /// serializable plan manifest (text, JSON, or YAML).
    ///
    /// Analogous to `forjar plan` — shows what will happen before committing.
    Plan {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Output format: text, json, yaml
        #[arg(long, default_value = "text")]
        format: String,
    },

    /// Train a tokenizer on the corpus.
    ///
    /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
    /// and writes vocab.json + merges.txt to the output directory.
    ///
    /// Analogous to `forjar apply` — commits resources and executes the plan.
    Apply {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Maximum number of lines to read from corpus (0 = all)
        #[arg(long, default_value = "0")]
        max_lines: usize,
    },

    /// Train BPE on a JSONL corpus per contracts/tokenizer-bpe-v1.yaml (MODEL-2).
    ///
    /// Walks `--corpus` (a file or a directory of `.jsonl` files), extracts the
    /// `content` field from each line, applies `--normalization` (NFC by
    /// default), and trains a BPE tokenizer with the target vocab size. Writes
    /// `vocab.json` (token→id) and `merges.txt` (one `a b` pair per line, in
    /// merge order) to `--output`.
    Train {
        /// Path to corpus: a `.jsonl` file or a directory containing `.jsonl`
        /// files. Each line must be a JSON object with a `content` field.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Target vocabulary size. Default 50_257 matches GPT-2 convention
        /// (50_000 BPE merges + 256 byte-level fallback tokens + 1 sentinel)
        /// and the MODEL-2 albor tokenizer contract (tokenizer-bpe-v1 v1.2.0).
        #[arg(long, default_value = "50257")]
        vocab_size: usize,
        /// Minimum frequency a byte-pair must reach before BPE merges it into
        /// a new vocabulary token (honored by `entrenar::tokenizer::BPETokenizer`
        /// per task #103). Pairs below this threshold are left unmerged —
        /// contract INV-TOK-002 of `contracts/tokenizer-bpe-v1.yaml`.
        #[arg(long, default_value = "2")]
        min_frequency: usize,
        /// Output directory; will contain vocab.json and merges.txt.
        #[arg(long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Unicode normalization form applied to each document before training.
        #[arg(long, default_value = "nfc")]
        normalization: String,
    },

    /// Import a HuggingFace tokenizer.json into aprender's two-file
    /// vocab.json + merges.txt layout per
    /// `contracts/apr-cli-tokenize-import-hf-v1.yaml` (§50.4 step 5g.0).
    ///
    /// Reads `<INPUT>` (a HF tokenizer.json with `model.type == "BPE"`),
    /// extracts `model.vocab` → `<OUTPUT>/vocab.json` and `model.merges` →
    /// `<OUTPUT>/merges.txt` (one space-separated merge per line), and
    /// writes `<OUTPUT>/manifest.json` with extraction provenance
    /// (source path, sha256, vocab_size, merges_count, timestamp).
    ///
    /// Non-BPE inputs (Unigram, WordPiece) are rejected fail-fast, with a
    /// clear error citing the contract id.
    ///
    /// Unblocks fine-tuning from public HF checkpoints (Qwen2.5/Llama2/
    /// Mistral), which ship the tokenizer as a single tokenizer.json. The
    /// output dir is consumable by `apr tokenize encode-corpus --tokenizer
    /// <DIR>` and `apr pretrain --tokenizer <DIR>` without modification.
    ImportHf {
        /// Path to input HuggingFace tokenizer.json (BPE model required).
        #[arg(long, value_name = "FILE")]
        input: PathBuf,
        /// Output directory; will contain vocab.json + merges.txt + manifest.json.
        #[arg(long, value_name = "DIR")]
        output: PathBuf,
        /// Include `added_tokens` in vocab.json (default: BPE state machine only).
        /// Use this when the downstream consumer needs special tokens (e.g.,
        /// `<|im_start|>`, `<|endoftext|>`) materialized in vocab.json itself.
        #[arg(long, default_value_t = false)]
        include_added_tokens: bool,
    },

    /// Encode a JSONL corpus into `.bin` shards per contracts/pretokenize-bin-v1.yaml.
    ///
    /// Loads a trained BPE tokenizer (vocab.json + merges.txt) from `--tokenizer`,
    /// reads `--corpus` (file or directory of `.jsonl` files), encodes the
    /// `--content-field` of each line to u32 tokens, and writes
    /// `shard-NNNN.bin` files (flat little-endian u32 streams) into `--output`.
    /// The output format is precisely what `ShardBatchIter` (aprender-train)
    /// expects at MODEL-2 pretrain read time.
    ///
    /// Root-cause fix for the pretokenize-to-bin gap documented in
    /// memory/project_shard_reader_bin_format.md — replaces a Python shim
    /// that was flagged as MUDA on 2026-04-19.
    #[cfg(feature = "training")]
    EncodeCorpus {
        /// Path to JSONL corpus file or directory of `.jsonl` files.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Directory containing vocab.json + merges.txt from `apr tokenize train`.
        #[arg(long, value_name = "DIR")]
        tokenizer: PathBuf,
        /// Output directory for shard-NNNN.bin + manifest.json.
        #[arg(long, value_name = "DIR")]
        output: PathBuf,
        /// Target tokens per shard (shard closes once this limit is reached).
        #[arg(long, default_value = "10000000")]
        shard_tokens: usize,
        /// JSONL field to encode (default: `content`).
        #[arg(long, default_value = "content")]
        content_field: String,
        /// Unicode normalization (must match tokenizer training).
        #[arg(long, default_value = "nfc")]
        normalization: String,
        /// EOS insertion policy: none|between|after.
        #[arg(long, default_value = "between")]
        eos_policy: String,
    },
}
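
// Minimal sketch of a shard consumer, for orientation only: `read_shard` is a
// hypothetical helper, not an existing apr-cli or aprender-train API. It
// assumes only what the `EncodeCorpus` docs state: shards are flat streams of
// little-endian u32 token ids per contracts/pretokenize-bin-v1.yaml.
#[allow(dead_code)]
fn read_shard(path: &std::path::Path) -> std::io::Result<Vec<u32>> {
    let bytes = std::fs::read(path)?;
    // 4 bytes per token; `chunks_exact` drops a trailing partial word, which a
    // stricter reader would instead report as a corrupt shard.
    Ok(bytes
        .chunks_exact(4)
        .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]))
        .collect())
}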