# Crate metadata. Build-script detection and all target auto-discovery are
# switched off here (`build` and the `auto*` keys are false), so targets are
# declared explicitly in the `[lib]` and `[[test]]` tables of this manifest.
[package]
name = "language-tokenizer"
version = "0.2.0"
authors = ["savannstm <savannstm@gmail.com>"]
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
build = false
documentation = "https://docs.rs/language-tokenizer"
edition = "2021"
homepage = "https://crates.io/crates/language-tokenizer"
keywords = ["tokenizer", "text", "linguistic", "language", "segmenter"]
license-file = "LICENSE.md"
readme = "README.md"
repository = "https://github.com/savannstm/language-tokenizer"
rust-version = "1.83.0"
description = "Text tokenizer for linguistic purposes, such as text matching. Supports more than 40 languages, including English, French, Russian, Japanese, Thai etc."

# Optional functionality. A `dep:<crate>` entry pulls in the optional
# dependency of that name; a `<crate>/<feature>` entry turns on a feature of
# that dependency; a bare name refers to another feature of this crate.
[features]
# Same dependency set as `japanese-icu`.
chinese-icu = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/auto",
]
chinese-lindera = [
    "dep:lindera",
    "lindera/embed-cc-cedict",
]
# Umbrella feature: `southeast-asian`, `snowball`, and one lindera feature per
# CJK language (Japanese uses the ipadic-neologd variant). It does not enable
# `serde` or the `*-icu` CJK features.
full = [
    "southeast-asian",
    "snowball",
    "japanese-ipadic-neologd-lindera",
    "chinese-lindera",
    "korean-lindera",
]
# Same dependency set as `chinese-icu`.
japanese-icu = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/auto",
]
# The three Japanese lindera features differ only in which lindera `embed-*`
# feature they enable.
japanese-ipadic-lindera = [
    "dep:lindera",
    "lindera/embed-ipadic",
]
japanese-ipadic-neologd-lindera = [
    "dep:lindera",
    "lindera/embed-ipadic-neologd",
]
japanese-unidic-lindera = [
    "dep:lindera",
    "lindera/embed-unidic",
]
korean-lindera = [
    "dep:lindera",
    "lindera/embed-ko-dic",
]
serde = ["dep:serde"]
snowball = [
    "dep:waken_snowball",
    "dep:unicode-normalization",
    "dep:unicode-segmentation",
]
# Enables icu_segmenter's `lstm` feature, where the `*-icu` CJK features
# enable `auto` instead.
southeast-asian = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/lstm",
]

# Library target. Declared explicitly because `autolib = false` in `[package]`
# turns off automatic discovery of the library target.
[lib]
name = "language_tokenizer"
path = "src/lib.rs"
# Test target built from `tests/test.rs`. Listed explicitly because
# `autotests = false` in `[package]` turns off automatic test discovery.
[[test]]
name = "test"
path = "tests/test.rs"
# Crates marked `optional = true` are only built when an entry in `[features]`
# names them through `dep:<crate>`; all other crates are always built.
[dependencies]
icu_segmenter = { version = "2.2.0", optional = true }
itertools = { version = "0.14.0", optional = true }
lindera = { version = "3.0.4", optional = true }
num_enum = "0.7.6"
serde = { version = "1.0.228", optional = true }
strsim = "0.11.1"
strum = { version = "0.28.0", features = ["strum_macros"] }
strum_macros = "0.28.0"
thiserror = "2.0.18"
unicode-normalization = { version = "0.1.25", optional = true }
unicode-segmentation = { version = "1.13.2", optional = true }
waken_snowball = { version = "0.1.0", optional = true }