# Package metadata for the `bbpe` crate.
[package]
name = "bbpe"
version = "0.6.3"
authors = ["Binary BPE Maintainers <michael.bommarito@gmail.com>"]
categories = ["encoding", "text-processing"]
documentation = "https://docs.rs/bbpe"
edition = "2021"
# Gitignore-style patterns for files left out of the packaged crate; the
# leading `/` anchors each pattern at the package root.
exclude = [
    "/target",
    "/*.json",
    "/tokenizer*.json",
    "/**/*.swp",
    "/**/*.rs.bk",
    "/test_data",
    "/.gitignore",
    "/.github",
    "/experiments",
    "/references/tokenizers",
    "/scripts",
]
homepage = "https://github.com/mjbommar/binary-bpe"
keywords = ["bpe", "binary", "malware", "tokenizer", "huggingface"]
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/mjbommar/binary-bpe"
# Minimum supported Rust toolchain version declared for this package.
rust-version = "1.74"
description = "Binary byte pair encoding (BPE) trainer and CLI compatible with Hugging Face tokenizers"

# Cargo features. `cli` switches on the three optional dependencies
# (clap, indicatif, env_logger) and is required by the `bbpe` binary target;
# it is part of the default feature set.
[features]
default = ["cli"]
cli = [
    "clap",
    "env_logger",
    "indicatif",
]

# Library target: crate `bbpe`, rooted at src/lib.rs.
[lib]
name = "bbpe"
path = "src/lib.rs"

# Command-line binary `bbpe`. It is only built when the `cli` feature is
# enabled, because of `required-features` below.
[[bin]]
name = "bbpe"
path = "src/bin/bbpe.rs"
required-features = ["cli"]

# Runtime dependencies, sorted alphabetically. Entries marked
# `optional = true` are only pulled in through the `cli` feature.
[dependencies]
ahash = "0.8.11"
anyhow = "1.0"
blake3 = "1.5"
bstr = "1.9"
clap = { version = "4.5", features = ["derive"], optional = true }
env_logger = { version = "0.11", optional = true }
flate2 = "1.0"
indicatif = { version = "0.18", features = ["rayon"], optional = true }
log = "0.4"
rand = "0.8"
rayon = "1.10"
rustc-hash = "2.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
thiserror = "2.0"
tokenizers = { version = "0.22.1", default-features = true, features = ["onig"] }
walkdir = "2.5"

# Dependencies used only when building tests, examples, and benchmarks.
# `flate2` is not repeated here: it is already an unconditional entry in
# [dependencies], which those targets can use as well, so a second copy would
# only add another version requirement to keep in sync.
[dev-dependencies]
assert_cmd = "2.0"
criterion = { version = "0.7", default-features = false, features = ["cargo_bench_support"] }
insta = { version = "1.41", features = ["yaml"] }
tempfile = "3.12"

# docs.rs build settings: build the documentation with every feature enabled.
# (The empty [build-dependencies] table that used to precede this section
# declared nothing and has been dropped.)
[package.metadata.docs.rs]
all-features = true

# Benchmark target `training`. `harness = false` turns off the default libtest
# harness, so the benchmark must supply its own `main`
# (presumably via criterion from [dev-dependencies]; confirm in the bench source).
[[bench]]
name = "training"
harness = false