# language-tokenizer 0.3.0
#
# Text tokenizer for linguistic purposes, such as text matching. Supports more
# than 40 languages, including English, French, Russian, Japanese, Thai etc.
# Documentation: https://docs.rs/language-tokenizer
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
name = "language-tokenizer"
version = "0.3.0"
authors = ["savannstm <savannstm@gmail.com>"]
# Target auto-discovery is disabled; the lib/test targets are declared
# explicitly further down in this (generated) manifest.
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
build = false
documentation = "https://docs.rs/language-tokenizer"
edition = "2024"
homepage = "https://crates.io/crates/language-tokenizer"
keywords = ["tokenizer", "text", "linguistic", "language", "segmenter"]
license-file = "LICENSE.md"
readme = "README.md"
repository = "https://github.com/savannstm/language-tokenizer"
rust-version = "1.88.0"
description = "Text tokenizer for linguistic purposes, such as text matching. Supports more than 40 languages, including English, French, Russian, Japanese, Thai etc."

[features]
# ICU-based segmentation backend for Chinese (compiled ICU data, "auto" model).
chinese-icu = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/auto",
]
# Lindera backend for Chinese with the embedded CC-CEDICT dictionary.
chinese-lindera = ["dep:lindera", "lindera/embed-cc-cedict"]
# Umbrella feature: enables one backend per language family listed below.
full = [
    "southeast-asian",
    "snowball",
    "japanese-ipadic-neologd-lindera",
    "chinese-lindera",
    "korean-lindera",
]
# ICU-based segmentation backend for Japanese (same dependency set as chinese-icu).
japanese-icu = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/auto",
]
# Lindera backends for Japanese, differing only in the embedded dictionary.
japanese-ipadic-lindera = ["dep:lindera", "lindera/embed-ipadic"]
japanese-ipadic-neologd-lindera = ["dep:lindera", "lindera/embed-ipadic-neologd"]
japanese-unidic-lindera = ["dep:lindera", "lindera/embed-unidic"]
# Lindera backend for Korean with the embedded ko-dic dictionary.
korean-lindera = ["dep:lindera", "lindera/embed-ko-dic"]
# Optional serde integration.
serde = ["dep:serde"]
# Pulls in the waken_snowball crate plus Unicode normalization/segmentation helpers.
snowball = [
    "dep:waken_snowball",
    "dep:unicode-normalization",
    "dep:unicode-segmentation",
]
# ICU LSTM segmentation models (used for Southeast Asian scripts such as Thai).
southeast-asian = [
    "dep:icu_segmenter",
    "dep:itertools",
    "icu_segmenter/compiled_data",
    "icu_segmenter/lstm",
]

# Explicit library target: auto-discovery (`autolib = false` in [package]) is
# off, so the path must be spelled out. The crate name uses underscores while
# the package name is kebab-case.
[lib]
name = "language_tokenizer"
path = "src/lib.rs"

# Explicit integration-test target (`autotests = false` in [package] disables
# auto-discovery of files under tests/).
[[test]]
name = "test"
path = "tests/test.rs"

# Dependencies, alphabetical. Optional entries are only compiled in when the
# corresponding [features] entry activates them via `dep:`.
[dependencies]
icu_segmenter = { version = "2.2.0", optional = true }
itertools = { version = "0.14.0", optional = true }
lindera = { version = "3.0.5", optional = true }
num_enum = "0.7.6"
serde = { version = "1.0.228", optional = true }
strsim = "0.11.1"
strum = { version = "0.28.0", features = ["strum_macros"] }
strum_macros = "0.28.0"
thiserror = "2.0.18"
unicode-normalization = { version = "0.1.25", optional = true }
unicode-segmentation = { version = "1.13.2", optional = true }
waken_snowball = { version = "0.1.0", optional = true }