// apr_cli/tokenize_commands.rs

/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
///
/// Thin CLI wrappers around aprender's BPE training infrastructure.
/// Trains a BPE vocabulary from a text corpus for use in model training.
#[derive(Subcommand, Debug)]
pub enum TokenizeCommands {
    /// Validate inputs and estimate tokenizer training time/resources.
    ///
    /// Checks that the input corpus exists, counts lines/bytes, estimates
    /// vocabulary coverage, and reports expected training time. Outputs a
    /// serializable plan manifest (text, JSON, or YAML).
    ///
    /// Analogous to `forjar plan` — shows what will happen before committing.
    Plan {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Output format: text, json, yaml
        #[arg(long, default_value = "text")]
        format: String,
    },
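    // Illustrative invocation (the `plan` subcommand name assumes clap's
    // default kebab-case mapping of `Plan`; flags are as declared above):
    //
    //   apr tokenize plan --data corpus.txt --vocab-size 32000 --format json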

    /// Train a tokenizer on the corpus.
    ///
    /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
    /// and writes vocab.json + merges.txt to the output directory.
    ///
    /// Analogous to `forjar apply` — commits resources and executes the plan.
    Apply {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Maximum number of lines to read from corpus (0 = all)
        #[arg(long, default_value = "0")]
        max_lines: usize,
    },
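    // Illustrative invocation, mirroring the plan step above:
    //
    //   apr tokenize apply --data corpus.txt --vocab-size 32000 --output ./tok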

    /// Train BPE on a JSONL corpus per contracts/tokenizer-bpe-v1.yaml (MODEL-2).
    ///
    /// Walks `--corpus` (a file or a directory of `.jsonl` files), extracts the
    /// `content` field from each line, applies `--normalization` (NFC default),
    /// and trains a BPE tokenizer with the target vocab size. Writes
    /// `vocab.json` (token→id) and `merges.txt` (one `a b` pair per line, in
    /// merge order) to `--output`.
    Train {
        /// Path to corpus: a `.jsonl` file or a directory containing `.jsonl` files.
        /// Each line must be a JSON object with a `content` field.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "50000")]
        vocab_size: usize,
        /// Minimum frequency a byte pair must reach before BPE merges it into
        /// a new vocabulary token (honored by `entrenar::tokenizer::BPETokenizer`
        /// per task #103). Pairs below this threshold are left unmerged, per
        /// contract INV-TOK-002 of `contracts/tokenizer-bpe-v1.yaml`.
        #[arg(long, default_value = "2")]
        min_frequency: usize,
        /// Output directory; will contain vocab.json and merges.txt.
        #[arg(long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Unicode normalization form applied to each document before training.
        #[arg(long, default_value = "nfc")]
        normalization: String,
    },
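    // Example invocation (`apr tokenize train` is the exact path referenced by
    // `EncodeCorpus` below); the artifact contents shown are illustrative only:
    //
    //   apr tokenize train --corpus data/ --vocab-size 50000 --output ./tok
    //
    //   ./tok/vocab.json   {"a": 0, "b": 1, "ab": 2, ...}   (token -> id)
    //   ./tok/merges.txt   "a b\nab c\n..."                 (one pair per line,
    //                                                        in merge order)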

    /// Encode a JSONL corpus into `.bin` shards per contracts/pretokenize-bin-v1.yaml.
    ///
    /// Loads a trained BPE tokenizer (vocab.json + merges.txt) from `--tokenizer`,
    /// reads `--corpus` (file or directory of `.jsonl` files), encodes the
    /// `--content-field` of each line to u32 tokens, and writes
    /// `shard-NNNN.bin` files (flat little-endian u32 streams) into `--output`.
    /// The output format is precisely what `ShardBatchIter` (aprender-train)
    /// expects at MODEL-2 pretrain read time.
    ///
    /// Root-cause fix for the pretokenize-to-bin gap documented in
    /// memory/project_shard_reader_bin_format.md — replaces a Python shim
    /// that was flagged as MUDA on 2026-04-19.
    EncodeCorpus {
        /// Path to JSONL corpus file or directory of `.jsonl` files.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Directory containing vocab.json + merges.txt from `apr tokenize train`.
        #[arg(long, value_name = "DIR")]
        tokenizer: PathBuf,
        /// Output directory for shard-NNNN.bin + manifest.json.
        #[arg(long, value_name = "DIR")]
        output: PathBuf,
        /// Target tokens per shard (a shard closes once this limit is reached).
        #[arg(long, default_value = "10000000")]
        shard_tokens: usize,
        /// JSONL field to encode (default: `content`).
        #[arg(long, default_value = "content")]
        content_field: String,
        /// Unicode normalization (must match tokenizer training).
        #[arg(long, default_value = "nfc")]
        normalization: String,
        /// EOS insertion policy: none|between|after.
        #[arg(long, default_value = "between")]
        eos_policy: String,
    },
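    // Illustrative invocation (the `encode-corpus` name assumes clap's default
    // kebab-case mapping of `EncodeCorpus`):
    //
    //   apr tokenize encode-corpus --corpus data/ --tokenizer ./tok \
    //       --output ./shards --shard-tokens 10000000 --eos-policy between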
}
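
// A minimal sketch of the `.bin` shard layout described by `EncodeCorpus`: a
// flat stream of little-endian u32 token ids with no header or framing, so a
// reader such as `ShardBatchIter` recovers ids four bytes at a time. Both
// helpers below are hypothetical illustrations, not part of this CLI.
#[allow(dead_code)]
fn write_shard_sketch(path: &std::path::Path, tokens: &[u32]) -> std::io::Result<()> {
    // Each token id contributes exactly 4 bytes, little-endian per the contract.
    let mut buf = Vec::with_capacity(tokens.len() * 4);
    for &tok in tokens {
        buf.extend_from_slice(&tok.to_le_bytes());
    }
    std::fs::write(path, buf)
}

#[allow(dead_code)]
fn read_shard_sketch(path: &std::path::Path) -> std::io::Result<Vec<u32>> {
    // Inverse of the writer: chunk the byte stream into 4-byte LE words.
    let bytes = std::fs::read(path)?;
    Ok(bytes
        .chunks_exact(4)
        .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]))
        .collect())
}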