apr_cli/
tokenize_commands.rs

1/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
2///
3/// Thin CLI wrappers around aprender's BPE training infrastructure.
4/// Trains a BPE vocabulary from a text corpus for use in model training.
5#[derive(Subcommand, Debug)]
6pub enum TokenizeCommands {
7    /// Validate inputs and estimate tokenizer training time/resources.
8    ///
9    /// Checks that the input corpus exists, counts lines/bytes, estimates
10    /// vocabulary coverage, and reports expected training time. Outputs a
11    /// serializable plan manifest (text, JSON, or YAML).
12    ///
13    /// Analogous to `forjar plan` — shows what will happen before committing.
14    Plan {
15        /// Path to training corpus (text file, one document per line)
16        #[arg(long, value_name = "FILE")]
17        data: PathBuf,
18        /// Target vocabulary size
19        #[arg(long, default_value = "32000")]
20        vocab_size: usize,
21        /// Tokenizer algorithm: bpe, wordpiece, unigram
22        #[arg(long, default_value = "bpe")]
23        algorithm: String,
24        /// Output directory for trained tokenizer
25        #[arg(short, long, default_value = "./tokenizer-output")]
26        output: PathBuf,
27        /// Output format: text, json, yaml
28        #[arg(long, default_value = "text")]
29        format: String,
30    },
31
32    /// Train a tokenizer on the corpus.
33    ///
34    /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
35    /// and writes vocab.json + merges.txt to the output directory.
36    ///
37    /// Analogous to `forjar apply` — commits resources and executes the plan.
38    Apply {
39        /// Path to training corpus (text file, one document per line)
40        #[arg(long, value_name = "FILE")]
41        data: PathBuf,
42        /// Target vocabulary size
43        #[arg(long, default_value = "32000")]
44        vocab_size: usize,
45        /// Tokenizer algorithm: bpe, wordpiece, unigram
46        #[arg(long, default_value = "bpe")]
47        algorithm: String,
48        /// Output directory for trained tokenizer
49        #[arg(short, long, default_value = "./tokenizer-output")]
50        output: PathBuf,
51        /// Maximum number of lines to read from corpus (0 = all)
52        #[arg(long, default_value = "0")]
53        max_lines: usize,
54    },
55
56    /// Train BPE on a JSONL corpus per contracts/tokenizer-bpe-v1.yaml (MODEL-2).
57    ///
58    /// Walks `--corpus` (file or directory of `.jsonl` files), extracting the
59    /// `content` field from each line, applies `--normalization` (NFC default),
60    /// and trains a BPE tokenizer with the target vocab size. Writes
61    /// `vocab.json` (token→id) and `merges.txt` (one `a b` pair per line, in
62    /// merge order) to `--output`.
63    Train {
64        /// Path to corpus: a `.jsonl` file or a directory containing `.jsonl` files.
65        /// Each line must be a JSON object with a `content` field.
66        #[arg(long, value_name = "PATH")]
67        corpus: PathBuf,
68        /// Target vocabulary size. Default 50_257 matches GPT-2 convention
69        /// (50_000 BPE merges + 256 byte-level fallback tokens + 1 sentinel)
70        /// and the MODEL-2 albor tokenizer contract (tokenizer-bpe-v1 v1.2.0).
71        #[arg(long, default_value = "50257")]
72        vocab_size: usize,
73        /// Minimum frequency a byte-pair must reach before BPE merges it into
74        /// a new vocabulary token (honored by `entrenar::tokenizer::BPETokenizer`
75        /// per task #103). Pairs below this threshold are left unmerged —
76        /// contract INV-TOK-002 of `contracts/tokenizer-bpe-v1.yaml`.
77        #[arg(long, default_value = "2")]
78        min_frequency: usize,
79        /// Output directory; will contain vocab.json and merges.txt.
80        #[arg(long, default_value = "./tokenizer-output")]
81        output: PathBuf,
82        /// Unicode normalization form applied to each document before training.
83        #[arg(long, default_value = "nfc")]
84        normalization: String,
85    },
86
87    /// Import a HuggingFace tokenizer.json into aprender's two-file
88    /// vocab.json + merges.txt layout per
89    /// `contracts/apr-cli-tokenize-import-hf-v1.yaml` (§50.4 step 5g.0).
90    ///
91    /// Reads `<INPUT>` (a HF tokenizer.json with `model.type == "BPE"`),
92    /// extracts `model.vocab` → `<OUTPUT>/vocab.json`, `model.merges` →
93    /// `<OUTPUT>/merges.txt` (one space-separated merge per line), and
94    /// writes `<OUTPUT>/manifest.json` with extraction provenance
95    /// (source path, sha256, vocab_size, merges_count, timestamp).
96    ///
97    /// Non-BPE inputs (Unigram, WordPiece) are rejected fail-fast with a
98    /// clear error citing the contract id.
99    ///
100    /// Unblocks fine-tuning from public HF checkpoints (Qwen2.5/Llama2/
101    /// Mistral) which distribute as a single tokenizer.json. The output
102    /// dir is consumable by `apr tokenize encode-corpus --tokenizer <DIR>`
103    /// and `apr pretrain --tokenizer <DIR>` without modification.
104    ImportHf {
105        /// Path to input HuggingFace tokenizer.json (BPE model required).
106        #[arg(long, value_name = "FILE")]
107        input: PathBuf,
108        /// Output directory; will contain vocab.json + merges.txt + manifest.json.
109        #[arg(long, value_name = "DIR")]
110        output: PathBuf,
111        /// Include `added_tokens` in vocab.json (default: BPE state machine only).
112        /// Use this when the downstream consumer needs special tokens (e.g.,
113        /// `<|im_start|>`, `<|endoftext|>`) materialized in vocab.json itself.
114        #[arg(long, default_value_t = false)]
115        include_added_tokens: bool,
116    },
117
118    /// Encode a JSONL corpus into `.bin` shards per contracts/pretokenize-bin-v1.yaml.
119    ///
120    /// Loads a trained BPE tokenizer (vocab.json + merges.txt) from `--tokenizer`,
121    /// reads `--corpus` (file or directory of `.jsonl` files), encodes the
122    /// `--content-field` of each line to u32 tokens, and writes
123    /// `shard-NNNN.bin` files (flat little-endian u32 streams) into `--output`.
124    /// The output format is precisely what `ShardBatchIter` (aprender-train)
125    /// expects at MODEL-2 pretrain read time.
126    ///
127    /// Root-cause fix for the pretokenize-to-bin gap documented in
128    /// memory/project_shard_reader_bin_format.md — replaces a Python shim
129    /// that was flagged as MUDA on 2026-04-19.
130    #[cfg(feature = "training")]
131    EncodeCorpus {
132        /// Path to JSONL corpus file, parquet shard, or directory of `.jsonl`
133        /// or `.parquet` files. Pass `--corpus` multiple times to merge
134        /// multiple sources into a single output corpus (SPEC §83 P2-C —
135        /// see `contracts/corpus-merge-v3-v1.yaml`). When repeated, sources
136        /// are encoded in command-line order and shard numbering is
137        /// continuous across sources.
138        #[arg(long, value_name = "PATH", required = true)]
139        corpus: Vec<PathBuf>,
140        /// Directory containing vocab.json + merges.txt from `apr tokenize train`.
141        #[arg(long, value_name = "DIR")]
142        tokenizer: PathBuf,
143        /// Output directory for shard-NNNN.bin + manifest.json.
144        #[arg(long, value_name = "DIR")]
145        output: PathBuf,
146        /// Target tokens per shard (shard closes once this limit is reached).
147        #[arg(long, default_value = "10000000")]
148        shard_tokens: usize,
149        /// JSONL field to encode (default: `content`).
150        #[arg(long, default_value = "content")]
151        content_field: String,
152        /// Unicode normalization (must match tokenizer training).
153        #[arg(long, default_value = "nfc")]
154        normalization: String,
155        /// EOS insertion policy: none|between|after.
156        #[arg(long, default_value = "between")]
157        eos_policy: String,
158        /// Number of rayon workers for per-document BPE encoding.
159        ///
160        /// Defaults to `std::thread::available_parallelism()` (logical CPU count).
161        /// Set to `1` to force the single-threaded byte-identical legacy path.
162        /// Set to a fixed N to bound memory or share the host with other jobs.
163        ///
164        /// Output shard order is preserved: chunked encoding keeps original
165        /// document order regardless of worker count (issue #1547,
166        /// contracts/apr-tokenize-parallel-bpe-v1.yaml `parallel_correctness`).
167        #[arg(long, value_name = "N")]
168        num_workers: Option<usize>,
169        /// Suppress per-document progress emission to stderr (issue #1547,
170        /// contract v1.2.0). Default: emit a `[progress] doc=N/T tokens=K
171        /// rate=X.X docs/s eta=...` line every `--progress-interval-docs`
172        /// docs OR `--progress-interval-seconds` seconds (whichever fires
173        /// first). Useful for CI / log-scraping callers that prefer silence.
174        #[arg(long, default_value_t = false)]
175        quiet: bool,
176        /// Emit a progress line at most every N docs (default 1000). Pair
177        /// with `--progress-interval-seconds` — whichever bound is reached
178        /// first triggers emission. Issue #1547 contract v1.2.0.
179        #[arg(long, value_name = "N", default_value_t = 1000)]
180        progress_interval_docs: u64,
181        /// Emit a progress line at most every S seconds (default 60). Pair
182        /// with `--progress-interval-docs` — whichever bound is reached
183        /// first triggers emission. Issue #1547 contract v1.2.0.
184        #[arg(long, value_name = "S", default_value_t = 60)]
185        progress_interval_seconds: u64,
186        /// Pre-flight only: estimate total tokens / shards / wall time
187        /// without writing any output. Reads `--estimate-sample-docs`
188        /// (default 1000), encodes them, observes (tokens, wall-time-
189        /// per-doc), and extrapolates against the total document count.
190        /// Emits `[estimate]` lines on stderr; no shards or manifest are
191        /// written. Operator pre-flight gate before multi-day encode
192        /// runs (issue #1547 contract v1.3.0).
193        #[arg(long, default_value_t = false)]
194        estimate_only: bool,
195        /// Number of documents to sample for `--estimate-only`
196        /// extrapolation (default: 1000). Larger samples → tighter
197        /// per-doc rate estimate but longer pre-flight wall.
198        #[arg(long, value_name = "N", default_value_t = 1000)]
199        estimate_sample_docs: u64,
200    },
201
202    /// Reconstruct manifest.json from existing shard-NNNN.bin files.
203    ///
204    /// `apr tokenize encode-corpus` writes manifest.json only on clean
205    /// process exit. If the encoder is killed (operator SIGINT, OOM,
206    /// crash, power loss) AFTER all shards flush but BEFORE manifest
207    /// write, the corpus on disk is consumable by `ShardBatchIter` but
208    /// has no provenance file for ship audit / dashboards.
209    ///
210    /// `repair-manifest` is the cheap recovery path: it scans
211    /// `<OUTPUT>/shard-*.bin`, computes shard_count + total_tokens
212    /// from file sizes (each shard is a flat little-endian u32 stream;
213    /// tokens = file_size / 4), and writes a schema-conforming
214    /// `manifest.json`. Idempotent: runs twice are byte-identical
215    /// modulo `repaired_at` timestamp.
216    ///
217    /// Contract: `contracts/apr-tokenize-repair-manifest-v1.yaml`.
218    /// Motivating instance: SHIP-TWO §56 5g.1 corpus (228 shards
219    /// flushed, manifest missing).
220    #[cfg(feature = "training")]
221    RepairManifest {
222        /// Output directory containing shard-NNNN.bin files.
223        /// `manifest.json` will be written into this directory.
224        #[arg(long, value_name = "DIR")]
225        output: PathBuf,
226        /// Optional tokenizer directory; when provided, `vocab.json`
227        /// is read for the manifest's `vocab_size` field. Without it,
228        /// `vocab_size` is recorded as `null` (provenance-incomplete
229        /// but otherwise valid).
230        #[arg(long, value_name = "DIR")]
231        tokenizer: Option<PathBuf>,
232        /// Emit the manifest body as JSON to stdout (in addition to
233        /// writing to disk).
234        #[arg(long, default_value_t = false)]
235        json: bool,
236    },
237}
apr_cli/tokenize_commands.rs

apr_cli/
tokenize_commands.rs