# language-tokenizer 0.1.0
#
# Text tokenizer for linguistic purposes, such as text matching. Supports more than 40 languages, including English, French, Russian, Japanese, Thai, etc.
# Documentation: https://docs.rs/language-tokenizer
# Crate metadata. Keys follow the usual Cargo layout: `name` and `version`
# first, the remaining keys alphabetically, and `description` last.
[package]
name = "language-tokenizer"
version = "0.1.0"
authors = ["savannstm <savannstm@gmail.com>"]
documentation = "https://docs.rs/language-tokenizer"
edition = "2021"
homepage = "https://crates.io/crates/language-tokenizer"
keywords = ["tokenizer", "text", "linguistic", "language", "segmenter"]
# The license is supplied as a file rather than as an SPDX `license` expression.
license-file = "LICENSE.md"
readme = "README.md"
repository = "https://github.com/savannstm/language-tokenizer"
# Minimum supported Rust version declared for this crate.
rust-version = "1.83.0"
description = "Text tokenizer for linguistic purposes, such as text matching. Supports more than 40 languages, including English, French, Russian, Japanese, Thai etc."

# Entries are kept in alphabetical order. Dependencies marked
# `optional = true` are only pulled in through the `dep:` entries of the
# [features] table; the rest are always compiled.
[dependencies]
icu_segmenter = { version = "2.1.2", optional = true }
itertools = { version = "0.14.0", optional = true }
lindera = { version = "1.5.1", optional = true }
num_enum = "0.7.5"
serde = { version = "1.0.228", optional = true }
strsim = "0.11.1"
strum = { version = "0.27.2", features = ["strum_macros"] }
strum_macros = "0.27.2"
thiserror = "2.0.17"
unicode-normalization = { version = "0.1.25", optional = true }
unicode-segmentation = { version = "1.12.0", optional = true }
waken_snowball = { version = "0.1.0", optional = true }

[features]
# Convenience bundle. It does not include the ICU-based Japanese/Chinese
# features, the `japanese-ipadic-lindera` / `japanese-unidic-lindera`
# dictionaries, or `serde`; enable those explicitly when needed.
full = [
    "southeast-asian",
    "snowball",
    "japanese-ipadic-neologd-lindera",
    "chinese-lindera",
    "korean-lindera",
]

# Enables `waken_snowball` together with the two Unicode helper crates.
snowball = [
    "dep:waken_snowball",
    "dep:unicode-normalization",
    "dep:unicode-segmentation",
]

# Lindera-backed features: each one enables `lindera` plus one of its
# `embedded-*` dictionary features.
japanese-ipadic-neologd-lindera = ["dep:lindera", "lindera/embedded-ipadic-neologd"]
japanese-ipadic-lindera = ["dep:lindera", "lindera/embedded-ipadic"]
japanese-unidic-lindera = ["dep:lindera", "lindera/embedded-unidic"]
chinese-lindera = ["dep:lindera", "lindera/embedded-cc-cedict"]
korean-lindera = ["dep:lindera", "lindera/embedded-ko-dic"]

# ICU-backed features. `japanese-icu` and `chinese-icu` currently enable the
# same set of dependencies and `icu_segmenter` features.
japanese-icu = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/auto",
]
chinese-icu = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/auto",
]

# Same dependencies as the ICU features above, but with the `lstm` feature of
# `icu_segmenter` instead of `auto`.
southeast-asian = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/lstm",
]

# Enables the optional `serde` dependency.
serde = ["dep:serde"]