# bbpe 0.6.3
#
# Binary byte pair encoding (BPE) trainer and CLI compatible with Hugging Face tokenizers
# Documentation: https://docs.rs/bbpe
# Crate metadata. Key order follows the usual Cargo layout: name and version
# first, the remaining keys alphabetically, and description last.
[package]
name = "bbpe"
version = "0.6.3"
authors = ["Binary BPE Maintainers <michael.bommarito@gmail.com>"]
categories = ["encoding", "text-processing"]
documentation = "https://docs.rs/bbpe"
edition = "2021"
# Patterns for files left out of the packaged crate. The leading `/` anchors
# each pattern at the package root. Note that `/*.json` already matches
# everything `/tokenizer*.json` does; both are kept as written.
exclude = [
    "/target",
    "/*.json",
    "/tokenizer*.json",
    "/**/*.swp",
    "/**/*.rs.bk",
    "/test_data",
    "/.gitignore",
    "/.github",
    "/experiments",
    "/references/tokenizers",
    "/scripts",
]
homepage = "https://github.com/mjbommar/binary-bpe"
keywords = ["bpe", "binary", "malware", "tokenizer", "huggingface"]
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/mjbommar/binary-bpe"
# Minimum supported Rust toolchain declared for this crate.
rust-version = "1.74"
description = "Binary byte pair encoding (BPE) trainer and CLI compatible with Hugging Face tokenizers"

# `cli` is enabled by default. It turns on the optional `clap`, `indicatif`
# and `env_logger` dependencies, and the `bbpe` binary target lists it in
# `required-features`.
[features]
default = ["cli"]
cli = [
    "clap",
    "indicatif",
    "env_logger",
]

# Library target: exposes the crate as `bbpe`, rooted at src/lib.rs.
[lib]
name = "bbpe"
path = "src/lib.rs"

# Command-line binary target. It is only built when the `cli` feature is
# enabled (which it is by default, see `[features]`).
[[bin]]
name = "bbpe"
path = "src/bin/bbpe.rs"
required-features = ["cli"]

# Always-on dependencies, sorted alphabetically.
[dependencies]
ahash = "0.8.11"
anyhow = "1.0"
blake3 = "1.5"
bstr = "1.9"
flate2 = "1.0"
log = "0.4"
rand = "0.8"
rayon = "1.10"
rustc-hash = "2.1"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
thiserror = "2.0"
tokenizers = { version = "0.22.1", default-features = true, features = ["onig"] }
walkdir = "2.5"

# Optional dependencies, pulled in only through the `cli` feature.
clap = { version = "4.5", features = ["derive"], optional = true }
env_logger = { version = "0.11", optional = true }
indicatif = { version = "0.18", features = ["rayon"], optional = true }

# Dependencies used only by tests, examples and benchmarks; sorted
# alphabetically.
[dev-dependencies]
assert_cmd = "2.0"
criterion = { version = "0.7", default-features = false, features = ["cargo_bench_support"] }
# Also declared under [dependencies] with the same version requirement.
flate2 = "1.0"
insta = { version = "1.41", features = ["yaml"] }
tempfile = "3.12"

# Currently empty: no build-script dependencies are declared.
[build-dependencies]

# docs.rs build settings: generate the hosted documentation with every
# feature enabled.
[package.metadata.docs.rs]
all-features = true

# Benchmark target `training`. `harness = false` turns off the built-in
# libtest harness so the benchmark supplies its own `main`
# (presumably via Criterion, which is a dev-dependency; confirm in the
# bench source).
[[bench]]
name = "training"
harness = false