kham_core/lib.rs
//! # kham-core
//!
//! Pure-Rust Thai word segmentation engine. `no_std`-compatible (requires `alloc`).
//!
//! ## Quick start
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tokenizer = Tokenizer::new();
//! let tokens = tokenizer.segment("กินข้าวกับปลา");
//! for token in &tokens {
//!     println!("{} ({:?})", token.text, token.kind);
//! }
//! ```
//!
//! ### Mixed script
//!
//! Non-Thai spans (Latin, numbers, emoji) pass through unchanged alongside Thai tokens:
//!
//! ```rust
//! use kham_core::{Tokenizer, TokenKind};
//!
//! let tok = Tokenizer::new();
//! let tokens = tok.segment("ธนาคาร100แห่ง");
//! assert_eq!(tokens[1].text, "100");
//! assert_eq!(tokens[1].kind, TokenKind::Number);
//! // char_span counts `char`s (Unicode scalar values): it matches Python
//! // string indexing directly, and JS indexing for BMP text such as Thai.
//! assert_eq!(tokens[0].char_span, 0..6); // ธนาคาร = 6 chars
//! assert_eq!(tokens[1].char_span, 6..9); // 100 = 3 chars
//! ```
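//!
//! The same pass-through applies to emoji. A minimal sketch of that claim,
//! asserting only the behaviour described above (the exact `TokenKind`
//! assigned to emoji is an implementation detail and is not asserted here):
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::new();
//! let tokens = tok.segment("กินข้าว🙂");
//! // The emoji survives as its own unchanged span.
//! assert!(tokens.iter().any(|t| t.text == "🙂"));
//! ```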
//!
//! ### Custom dictionary
//!
//! Merge extra words with the built-in dictionary using the builder:
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::builder()
//!     .dict_words("ปัญญาประดิษฐ์\n")
//!     .build();
//! let tokens = tok.segment("ปัญญาประดิษฐ์คือ");
//! assert!(tokens.iter().any(|t| t.text == "ปัญญาประดิษฐ์"));
//! ```
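//!
//! The snippet above implies a newline-separated, one-word-per-line format.
//! Assuming that format extends to multiple entries (a sketch, not a
//! documented contract), several words can be merged in one call:
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::builder()
//!     .dict_words("ปัญญาประดิษฐ์\nบิ๊กดาต้า\n") // one word per line
//!     .build();
//! assert!(!tok.segment("ปัญญาประดิษฐ์").is_empty());
//! ```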
//!
//! ### Normalize then segment
//!
//! [`Tokenizer::segment`] is zero-copy. For input with stacked tone marks or
//! floating vowels (สระลอย) typed in the wrong order, normalize into a new
//! `String` first, then borrow it:
//!
//! ```rust
//! use kham_core::Tokenizer;
//!
//! let tok = Tokenizer::new();
//! let normalized = tok.normalize("กเินข้าว"); // reorders the misplaced vowel (สระลอย)
//! let tokens = tok.segment(&normalized); // tokens borrow `normalized`
//! assert!(!tokens.is_empty());
//! ```
//!
//! ### Full-text search pipeline
//!
//! [`fts::FtsTokenizer`] wraps the segmenter with stopword tagging, synonym
//! expansion, POS tagging, and named-entity recognition in one call:
//!
//! ```rust
//! use kham_core::fts::FtsTokenizer;
//!
//! let fts = FtsTokenizer::new();
//!
//! // All tokens with metadata (position, kind, stopword flag, synonyms, …)
//! let tokens = fts.segment_for_fts("กินข้าวกับปลา");
//! assert!(tokens.iter().any(|t| t.text == "กับ" && t.is_stop));
//!
//! // Only indexable tokens (stopwords excluded, positions preserved)
//! let indexed = fts.index_tokens("กินข้าวกับปลา");
//! assert!(indexed.iter().all(|t| !t.is_stop));
//!
//! // Flat list of lexeme strings ready for a tsvector
//! let lexemes = fts.lexemes("กินข้าวกับปลา");
//! assert!(lexemes.iter().any(|l| l == "กิน" || l == "ปลา"));
//! ```
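//!
//! A sketch of handing those lexemes to PostgreSQL. The space-joined string
//! and the query are illustrative assumptions, not part of this crate's API
//! (the sketch also assumes `lexemes` yields a `Vec<String>`):
//!
//! ```rust
//! use kham_core::fts::FtsTokenizer;
//!
//! let fts = FtsTokenizer::new();
//! // Space-join the lexemes, then bind `joined` as $1 in e.g.
//! //   SELECT to_tsvector('simple', $1);
//! let joined = fts.lexemes("กินข้าวกับปลา").join(" ");
//! assert!(joined.contains("กิน"));
//! ```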
#![no_std]
#![forbid(unsafe_code)]
#![warn(missing_docs)]

extern crate alloc;

pub mod abbrev;
pub mod date;
pub mod dict;
pub mod error;
pub mod freq;
pub mod fts;
pub mod ne;
pub mod ngram;
pub mod normalizer;
pub mod number;
pub mod pos;
pub mod pre_tokenizer;
pub mod romanizer;
pub mod segmenter;
pub mod sentence;
pub mod soundex;
pub mod stopwords;
pub mod synonym;
pub mod tcc;
pub mod token;

pub use error::KhamError;
pub use segmenter::{Tokenizer, TokenizerBuilder};
pub use token::{NamedEntityKind, Token, TokenKind};

/// Decompress zlib-compressed built-in data produced by the build script.
pub(crate) fn decompress_builtin(data: &[u8]) -> alloc::string::String {
    let bytes = miniz_oxide::inflate::decompress_to_vec_zlib(data)
        .expect("built-in data decompression failed");
    alloc::string::String::from_utf8(bytes).expect("built-in data is valid UTF-8")
}
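
// A minimal sketch of the producing side, for orientation only. It assumes the
// build script compresses newline-separated word lists with
// `miniz_oxide::deflate::compress_to_vec_zlib`; the real build.rs is not shown
// in this file, and the paths below are hypothetical.
//
//     // build.rs (hypothetical sketch)
//     let dict = std::fs::read_to_string("data/words.txt")?;
//     let compressed = miniz_oxide::deflate::compress_to_vec_zlib(dict.as_bytes(), 10);
//     std::fs::write(out_dir.join("words.zlib"), compressed)?;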