# Package metadata for the `kitoken` crate. This manifest is also the root of the
# workspace declared in the `[workspace]` table.
[package]
name = "kitoken"
version = "0.10.1"
authors = ["Christian Sdunek <me@systemcluster.me>"]
categories = ["text-processing", "algorithms", "wasm", "no-std", "parser-implementations"]
edition = "2021"
homepage = "https://kitoken.dev"
# Allow-list of paths that are packaged when the crate is published.
# NOTE(review): "LICENCE" uses the British spelling; confirm it matches the file
# name on disk exactly.
include = [
    "Cargo.toml",
    "src/**/*",
    "LICENCE",
]
keywords = ["tokenizer", "nlp", "bpe", "unigram", "wordpiece"]
license = "BSD-2-Clause"
readme = "README.md"
repository = "https://github.com/Systemcluster/kitoken"
resolver = "2"
rust-version = "1.82.0"
description = "Fast and versatile tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization"

# Workspace members: this crate itself (".") and the two packages under `packages/`.
[workspace]
members = [".", "packages/python", "packages/javascript"]

# Library target: named `kitoken`, with its crate root at `src/lib.rs`.
[lib]
name = "kitoken"
path = "src/lib.rs"
# Cargo features. `dep:name` entries enable an optional dependency; `crate/feature`
# entries enable a feature of a dependency.
[features]
# Enabled unless a dependent sets `default-features = false`.
default = ["std", "serialization", "normalization", "convert", "regex-perf", "multiversion"]
# The default set plus `split` and `regex-unicode`. `regex-onig` and `unstable`
# are not part of it and have to be requested explicitly.
all = [
    "std",
    "serialization",
    "normalization",
    "convert",
    "split",
    "regex-unicode",
    "regex-perf",
    "multiversion",
]
# `multiversion?/std` is a weak dependency feature: it turns on `std` in
# `multiversion` only when that optional dependency is enabled by something else.
std = [
    "dep:thiserror",
    "orx-priority-queue/std",
    "memchr/std",
    "multiversion?/std",
]
serialization = ["dep:serde", "dep:postcard"]
# Normalization: umbrella feature followed by its parts.
normalization = ["normalization-unicode", "normalization-charsmap"]
normalization-unicode = ["dep:unicode-normalization"]
normalization-charsmap = ["bstr/unicode"]
# Splitting: umbrella feature followed by its parts.
split = ["split-unicode-script"]
split-unicode-script = ["dep:unicode-script"]
# Conversion: umbrella feature followed by its parts.
convert = [
    "convert-tiktoken",
    "convert-sentencepiece",
    "convert-tokenizers",
    "convert-tekken",
    "convert-detect",
]
convert-tiktoken = ["dep:base64"]
convert-sentencepiece = ["dep:sentencepiece-model"]
convert-tokenizers = [
    "dep:base64",
    "dep:serde",
    "dep:serde_json",
    "hashbrown/serde",
]
convert-tekken = ["dep:base64", "dep:serde", "dep:serde_json"]
convert-detect = ["serialization"]
# Regex engine options.
regex-unicode = ["fancy-regex/unicode"]
regex-perf = ["fancy-regex/perf"]
regex-onig = ["dep:onig"]
multiversion = ["dep:multiversion"]
# Enables no dependencies or other features; presumably gates code in the
# sources only (not visible from this manifest).
unstable = []

# Dependencies that are always compiled in, sorted alphabetically. Where
# `default-features = false` is set, only the listed features are requested.
[dependencies]
bstr = { version = "1.10", default-features = false, features = ["alloc"] }
derive_more = { version = "1.0", default-features = false, features = [
    "deref",
    "deref_mut",
    "as_ref",
    "index",
    "index_mut",
] }
fancy-regex = { version = "0.14", default-features = false }
hashbrown = { version = "0.15", default-features = false, features = [
    "default-hasher",
    "inline-more",
    "allocator-api2",
] }
log = { version = "0.4" }
memchr = { version = "2.7", default-features = false, features = ["alloc"] }
once_cell = { version = "1.19", default-features = false, features = ["alloc", "race"] }
orx-priority-queue = { version = "1.4", default-features = false }
regex-automata = { version = "0.4", default-features = false, features = [
    "alloc",
    "syntax",
    "meta",
    "nfa",
    "dfa",
    "hybrid",
    "unicode-perl",
    "unicode-gencat",
    "unicode-case",
] }
regex-syntax = { version = "0.8", default-features = false, features = [
    "unicode-perl",
    "unicode-gencat",
    "unicode-case",
] }
# Optional dependencies, sorted alphabetically. Each one is switched on by a
# `dep:` entry in the `[features]` table.
base64 = { version = "0.22", default-features = false, features = ["alloc"], optional = true }
multiversion = { version = "0.8", default-features = false, optional = true }
onig = { version = "6.4", default-features = false, optional = true }
postcard = { version = "1.0", default-features = false, features = ["alloc"], optional = true }
sentencepiece-model = { version = "0.1", default-features = false, optional = true }
serde = { version = "1.0", default-features = false, features = ["alloc", "derive"], optional = true }
serde_json = { version = "1.0", default-features = false, features = ["alloc"], optional = true }
thiserror = { version = "1.0", optional = true }
unicode-normalization = { version = "0.1", optional = true }
unicode-script = { version = "0.5", optional = true }

# Dependencies used only when building tests, benchmarks and examples.
[dev-dependencies]
console = { version = "0.15", features = ["windows-console-colors"] }
criterion = { version = "0.5", default-features = false, features = ["cargo_bench_support"] }
# The crate lists itself so that dev targets are compiled with an explicit
# feature set (currently the default features plus `split`).
kitoken = { path = ".", default-features = false, features = [
    "std",
    "serialization",
    "convert",
    "normalization",
    "split",
    "regex-perf",
    "multiversion",
] }
simple_logger = { version = "5.0" }

# docs.rs build settings: rustdoc is invoked with `--cfg docsrs`, presumably so the
# sources can detect a documentation build via `cfg(docsrs)`.
# NOTE(review): no `features` / `all-features` key is set here, so docs.rs documents
# the default feature set only — confirm that is intended.
[package.metadata.docs.rs]
rustdoc-args = ["--cfg", "docsrs"]
# Release builds are tuned for small output: size-oriented optimization, fat LTO,
# a single codegen unit, aborting panics and stripped symbols.
[profile.release]
# Code generation.
opt-level = "s"
lto = "fat"
codegen-units = 1
panic = "abort"
# Debug information.
debug = false
strip = "symbols"
# Runtime checks.
debug-assertions = false
overflow-checks = false
# Build caching.
incremental = false
# Build scripts and proc macros (and their dependencies) are size-optimized too.
build-override.opt-level = "s"

# Custom profile, selected with `--profile performance`: the release settings with
# `opt-level = 3` for the crate as well as for build scripts and proc macros.
[profile.performance]
inherits = "release"
opt-level = 3
build-override.opt-level = 3

# Benchmark builds: the release settings with `opt-level = 3`, debug info enabled
# and nothing stripped (presumably so profilers can resolve symbols).
[profile.bench]
inherits = "release"
opt-level = 3
debug = true
strip = "none"
build-override.opt-level = 3

# Test builds: the dev profile with a low optimization level (`opt-level = 1`).
[profile.test]
inherits = "dev"
opt-level = 1
# Benchmark targets. `harness = false` disables the built-in libtest harness, so
# each bench file must supply its own `main` (presumably through Criterion, which
# is listed in `[dev-dependencies]`).
[[bench]]
name = "encode_cl100k"
harness = false
path = "benches/bench_encode_cl100k.rs"

[[bench]]
name = "encode_gpt2"
harness = false
path = "benches/bench_encode_gpt2.rs"

[[bench]]
name = "encode_llama2"
harness = false
path = "benches/bench_encode_llama2.rs"

[[bench]]
name = "encode_xlnet"
harness = false
path = "benches/bench_encode_xlnet.rs"