/// Tokenizer training pipeline subcommands (forjar-style plan/apply).
///
/// Thin CLI wrappers around aprender's BPE training infrastructure. These
/// train a BPE vocabulary from a text corpus for use in model training.
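///
/// Typical flow, shown as a sketch: the binary name `apr` and the `tokenize`
/// subcommand path are taken from the `apr tokenize train` reference below,
/// and clap derives the kebab-case subcommand names from the variants:
/// ```text
/// apr tokenize plan  --data corpus.txt --vocab-size 32000
/// apr tokenize apply --data corpus.txt --vocab-size 32000 --output ./tok
/// ```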
#[derive(Subcommand, Debug)]
pub enum TokenizeCommands {
    /// Validate inputs and estimate tokenizer training time/resources.
    ///
    /// Checks that the input corpus exists, counts lines/bytes, estimates
    /// vocabulary coverage, and reports expected training time. Outputs a
    /// serializable plan manifest (text, JSON, or YAML).
    ///
    /// Analogous to `forjar plan`: shows what will happen before committing.
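    ///
    /// A minimal sketch of the kind of corpus scan the plan step performs
    /// (`scan_corpus` is a hypothetical helper, not the actual handler):
    /// ```ignore
    /// use std::io::{BufRead, BufReader};
    ///
    /// // Returns (line count, byte size) for a corpus file.
    /// fn scan_corpus(path: &std::path::Path) -> std::io::Result<(usize, u64)> {
    ///     let bytes = std::fs::metadata(path)?.len();
    ///     let lines = BufReader::new(std::fs::File::open(path)?).lines().count();
    ///     Ok((lines, bytes))
    /// }
    /// ```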
    Plan {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Output format: text, json, yaml
        #[arg(long, default_value = "text")]
        format: String,
    },
    /// Train a tokenizer on the corpus.
    ///
    /// Reads the input corpus, trains a BPE/WordPiece/Unigram tokenizer,
    /// and writes vocab.json + merges.txt to the output directory.
    ///
    /// Analogous to `forjar apply`: commits resources and executes the plan.
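    ///
    /// A minimal sketch of the merges.txt layout written on success, assuming
    /// the same one-`a b`-pair-per-line format described under `train`
    /// (`write_merges` and `merges` are hypothetical names):
    /// ```ignore
    /// use std::io::Write;
    ///
    /// // Writes one merge pair per line, in merge order.
    /// fn write_merges(path: &std::path::Path, merges: &[(String, String)]) -> std::io::Result<()> {
    ///     let mut f = std::fs::File::create(path)?;
    ///     for (a, b) in merges {
    ///         writeln!(f, "{a} {b}")?;
    ///     }
    ///     Ok(())
    /// }
    /// ```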
    Apply {
        /// Path to training corpus (text file, one document per line)
        #[arg(long, value_name = "FILE")]
        data: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "32000")]
        vocab_size: usize,
        /// Tokenizer algorithm: bpe, wordpiece, unigram
        #[arg(long, default_value = "bpe")]
        algorithm: String,
        /// Output directory for trained tokenizer
        #[arg(short, long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Maximum number of lines to read from corpus (0 = all)
        #[arg(long, default_value = "0")]
        max_lines: usize,
    },
    /// Train BPE on a JSONL corpus per contracts/tokenizer-bpe-v1.yaml (MODEL-2).
    ///
    /// Walks `--corpus` (a `.jsonl` file or a directory of `.jsonl` files),
    /// extracts the `content` field from each line, applies `--normalization`
    /// (NFC by default), and trains a BPE tokenizer with the target vocab size.
    /// Writes `vocab.json` (token→id) and `merges.txt` (one `a b` pair per
    /// line, in merge order) to `--output`.
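    ///
    /// A minimal sketch of the per-line extract-and-normalize step (assumes
    /// the `serde_json` and `unicode-normalization` crates; `doc_text` is a
    /// hypothetical helper, not the actual implementation):
    /// ```ignore
    /// use unicode_normalization::UnicodeNormalization;
    ///
    /// // Parses one JSONL line, pulls out `content`, and applies NFC.
    /// fn doc_text(line: &str) -> Option<String> {
    ///     let v: serde_json::Value = serde_json::from_str(line).ok()?;
    ///     Some(v.get("content")?.as_str()?.nfc().collect())
    /// }
    /// ```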
    Train {
        /// Path to corpus: a `.jsonl` file or a directory containing `.jsonl` files.
        /// Each line must be a JSON object with a `content` field.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Target vocabulary size
        #[arg(long, default_value = "50000")]
        vocab_size: usize,
        /// Minimum frequency a byte pair must reach before BPE merges it into
        /// a new vocabulary token (honored by `entrenar::tokenizer::BPETokenizer`
        /// per task #103). Pairs below this threshold are left unmerged, per
        /// contract INV-TOK-002 of `contracts/tokenizer-bpe-v1.yaml`.
        #[arg(long, default_value = "2")]
        min_frequency: usize,
        /// Output directory; will contain vocab.json and merges.txt.
        #[arg(long, default_value = "./tokenizer-output")]
        output: PathBuf,
        /// Unicode normalization form applied to each document before training.
        #[arg(long, default_value = "nfc")]
        normalization: String,
    },
    /// Encode a JSONL corpus into `.bin` shards per contracts/pretokenize-bin-v1.yaml.
    ///
    /// Loads a trained BPE tokenizer (vocab.json + merges.txt) from `--tokenizer`,
    /// reads `--corpus` (file or directory of `.jsonl` files), encodes the
    /// `--content-field` of each line to u32 tokens, and writes
    /// `shard-NNNN.bin` files (flat little-endian u32 streams) into `--output`.
    /// The output format is precisely what `ShardBatchIter` (aprender-train)
    /// expects at MODEL-2 pretrain read time.
    ///
    /// Root-cause fix for the pretokenize-to-bin gap documented in
    /// memory/project_shard_reader_bin_format.md; replaces a Python shim
    /// that was flagged as MUDA on 2026-04-19.
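    ///
    /// A minimal sketch of the shard byte layout (a flat little-endian u32
    /// stream, per the description above; `write_shard` is hypothetical):
    /// ```ignore
    /// use std::io::Write;
    ///
    /// // Serializes tokens as consecutive little-endian u32 values.
    /// fn write_shard(path: &std::path::Path, tokens: &[u32]) -> std::io::Result<()> {
    ///     let mut f = std::io::BufWriter::new(std::fs::File::create(path)?);
    ///     for t in tokens {
    ///         f.write_all(&t.to_le_bytes())?;
    ///     }
    ///     f.flush()
    /// }
    /// ```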
    EncodeCorpus {
        /// Path to JSONL corpus file or directory of `.jsonl` files.
        #[arg(long, value_name = "PATH")]
        corpus: PathBuf,
        /// Directory containing vocab.json + merges.txt from `apr tokenize train`.
        #[arg(long, value_name = "DIR")]
        tokenizer: PathBuf,
        /// Output directory for shard-NNNN.bin + manifest.json.
        #[arg(long, value_name = "DIR")]
        output: PathBuf,
        /// Target tokens per shard (shard closes once this limit is reached).
        #[arg(long, default_value = "10000000")]
        shard_tokens: usize,
        /// JSONL field to encode (default: `content`).
        #[arg(long, default_value = "content")]
        content_field: String,
        /// Unicode normalization (must match tokenizer training).
        #[arg(long, default_value = "nfc")]
        normalization: String,
        /// EOS insertion policy: none|between|after.
        #[arg(long, default_value = "between")]
        eos_policy: String,
    },
}