[dependencies.icu_segmenter]
optional = true
version = "2.1.2"

[dependencies.itertools]
optional = true
version = "0.14.0"

[dependencies.lindera]
optional = true
version = "1.5.1"

[dependencies.num_enum]
version = "0.7.5"

[dependencies.serde]
optional = true
version = "1.0.228"

[dependencies.strsim]
version = "0.11.1"

[dependencies.strum]
features = ["derive"]
version = "0.27.2"

[dependencies.strum_macros]
version = "0.27.2"

[dependencies.thiserror]
version = "2.0.17"

[dependencies.unicode-normalization]
optional = true
version = "0.1.25"

[dependencies.unicode-segmentation]
optional = true
version = "1.12.0"

[dependencies.waken_snowball]
optional = true
version = "0.1.0"
[features]
chinese-icu = ["dep:icu_segmenter", "dep:itertools", "icu_segmenter/compiled_data", "icu_segmenter/auto"]
chinese-lindera = ["dep:lindera", "lindera/embedded-cc-cedict"]
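# `full` bundles the Lindera-backed CJK tokenizers plus the Southeast Asian and
# Snowball backends; the ICU backends, the alternate Japanese dictionaries
# (plain IPADIC, UniDic), and `serde` remain individually opt-in.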
full = ["southeast-asian", "snowball", "japanese-ipadic-neologd-lindera", "chinese-lindera", "korean-lindera"]
japanese-icu = ["dep:icu_segmenter", "dep:itertools", "icu_segmenter/compiled_data", "icu_segmenter/auto"]
japanese-ipadic-lindera = ["dep:lindera", "lindera/embedded-ipadic"]
japanese-ipadic-neologd-lindera = ["dep:lindera", "lindera/embedded-ipadic-neologd"]
japanese-unidic-lindera = ["dep:lindera", "lindera/embedded-unidic"]
korean-lindera = ["dep:lindera", "lindera/embedded-ko-dic"]
serde = ["dep:serde"]
snowball = ["dep:waken_snowball", "dep:unicode-normalization", "dep:unicode-segmentation"]
southeast-asian = ["dep:icu_segmenter", "dep:itertools", "icu_segmenter/compiled_data", "icu_segmenter/lstm"]
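
# Downstream crates select backends through these features; a sketch (version
# and feature choice are illustrative):
#
# [dependencies.language-tokenizer]
# version = "0.1.0"
# features = ["japanese-ipadic-lindera", "serde"]
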
[lib]
name = "language_tokenizer"
path = "src/lib.rs"
[package]
authors = ["savannstm <savannstm@gmail.com>"]
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
build = false
description = "Text tokenizer for linguistic purposes, such as text matching. Supports more than 40 languages, including English, French, Russian, Japanese, Thai etc."
documentation = "https://docs.rs/language-tokenizer"
edition = "2021"
homepage = "https://crates.io/crates/language-tokenizer"
keywords = ["tokenizer", "text", "linguistic", "language", "segmenter"]
license-file = "LICENSE.md"
name = "language-tokenizer"
readme = "README.md"
repository = "https://github.com/savannstm/language-tokenizer"
rust-version = "1.83.0"
version = "0.1.0"
[[test]]
name = "test"
path = "tests/test.rs"