Skip to main content

kham_core/
lib.rs

1//! # kham-core
2//!
3//! Pure Rust Thai word segmentation engine. `no_std` compatible (requires `alloc`).
4//!
5//! ## Quick start
6//!
7//! ```rust
8//! use kham_core::Tokenizer;
9//!
10//! let tokenizer = Tokenizer::new();
11//! let tokens = tokenizer.segment("กินข้าวกับปลา");
12//! for token in &tokens {
13//!     println!("{} ({:?})", token.text, token.kind);
14//! }
15//! ```
16#![no_std]
17#![forbid(unsafe_code)]
18#![warn(missing_docs)]
19
20extern crate alloc;
21
22pub mod abbrev;
23pub mod date;
24pub mod dict;
25pub mod error;
26pub mod freq;
27pub mod fts;
28pub mod ne;
29pub mod ngram;
30pub mod normalizer;
31pub mod number;
32pub mod pos;
33pub mod pre_tokenizer;
34pub mod romanizer;
35pub mod segmenter;
36pub mod sentence;
37pub mod soundex;
38pub mod stopwords;
39pub mod synonym;
40pub mod tcc;
41pub mod token;
42
43pub use error::KhamError;
44pub use segmenter::{Tokenizer, TokenizerBuilder};
45pub use token::{NamedEntityKind, Token, TokenKind};
46
47/// Decompress zlib-compressed built-in data produced by the build script.
48pub(crate) fn decompress_builtin(data: &[u8]) -> alloc::string::String {
49    let bytes = miniz_oxide::inflate::decompress_to_vec_zlib(data)
50        .expect("built-in data decompression failed");
51    alloc::string::String::from_utf8(bytes).expect("built-in data is valid UTF-8")
52}