Skip to main content

kiwi_rs/
types.rs

//! Public data types used by high-level kiwi-rs APIs.
//!
//! Unless stated otherwise, offset fields in this module are character-based
//! indices (`str.chars()`), not UTF-8 byte offsets.
5
6use std::env;
7use std::os::raw::c_int;
8use std::path::{Path, PathBuf};
9
10use crate::constants::{
11    KIWI_BUILD_DEFAULT, KIWI_DIALECT_STANDARD, KIWI_MATCH_ALL_WITH_NORMALIZING,
12};
13use crate::discovery::discover_default_model_path;
14use crate::error::{KiwiError, Result};
15
/// One user dictionary entry, as consumed by
/// [`crate::KiwiBuilder::add_user_words`].
#[derive(Clone, Debug)]
pub struct UserWord {
    /// The surface form being added.
    pub word: String,
    /// The part-of-speech tag assigned to the word.
    pub tag: String,
    /// The user score Kiwi uses during ranking.
    pub score: f32,
}
26
27impl UserWord {
28    /// Creates a user dictionary entry.
29    pub fn new(word: impl Into<String>, tag: impl Into<String>, score: f32) -> Self {
30        Self {
31            word: word.into(),
32            tag: tag.into(),
33            score,
34        }
35    }
36}
37
/// Per-call options for the `analyze*` and `tokenize*` APIs.
///
/// Most flag values are taken from the constants this crate re-exports
/// (`KIWI_MATCH_*`, `KIWI_DIALECT_*`).
#[derive(Clone, Copy, Debug)]
pub struct AnalyzeOptions {
    /// How many candidate analyses to return.
    pub top_n: usize,
    /// Bit flags that control token matching behavior.
    pub match_options: i32,
    /// Whether open-ended analysis mode is enabled.
    pub open_ending: bool,
    /// Bit mask of allowed dialects.
    pub allowed_dialects: i32,
    /// Penalty applied when selecting dialectal analyses.
    pub dialect_cost: f32,
}
55
56impl Default for AnalyzeOptions {
57    fn default() -> Self {
58        Self {
59            top_n: 1,
60            match_options: KIWI_MATCH_ALL_WITH_NORMALIZING,
61            open_ending: false,
62            allowed_dialects: KIWI_DIALECT_STANDARD,
63            dialect_cost: 3.0,
64        }
65    }
66}
67
68impl AnalyzeOptions {
69    /// Sets the number of candidates.
70    pub fn with_top_n(mut self, top_n: usize) -> Self {
71        self.top_n = top_n;
72        self
73    }
74
75    /// Sets `match_options` bit flags.
76    pub fn with_match_options(mut self, match_options: i32) -> Self {
77        self.match_options = match_options;
78        self
79    }
80
81    /// Enables or disables open-ending analysis.
82    pub fn with_open_ending(mut self, open_ending: bool) -> Self {
83        self.open_ending = open_ending;
84        self
85    }
86
87    /// Sets allowed dialect bit flags.
88    pub fn with_allowed_dialects(mut self, allowed_dialects: i32) -> Self {
89        self.allowed_dialects = allowed_dialects;
90        self
91    }
92
93    /// Sets dialect mismatch penalty.
94    pub fn with_dialect_cost(mut self, dialect_cost: f32) -> Self {
95        self.dialect_cost = dialect_cost;
96        self
97    }
98
99    pub(crate) fn validated_top_n(&self) -> Result<c_int> {
100        if self.top_n == 0 {
101            return Err(KiwiError::InvalidArgument(
102                "AnalyzeOptions.top_n must be >= 1".to_string(),
103            ));
104        }
105        if self.top_n > c_int::MAX as usize {
106            return Err(KiwiError::InvalidArgument(format!(
107                "AnalyzeOptions.top_n must be <= {}",
108                c_int::MAX
109            )));
110        }
111        Ok(self.top_n as c_int)
112    }
113}
114
/// Configuration applied at build time when constructing a [`crate::Kiwi`]
/// instance.
#[derive(Clone, Debug)]
pub struct BuilderConfig {
    /// Root directory of the model (for example `.../models/cong/base`).
    pub model_path: Option<PathBuf>,
    /// Worker thread count. `-1` follows Kiwi defaults.
    pub num_threads: i32,
    /// Bit flags for Kiwi build options (`KIWI_BUILD_*`).
    pub build_options: i32,
    /// Bit mask of enabled dialects.
    pub enabled_dialects: i32,
    /// Cost threshold used when the typo model is applied.
    pub typo_cost_threshold: f32,
}
129
130impl Default for BuilderConfig {
131    fn default() -> Self {
132        Self {
133            model_path: discover_default_model_path(),
134            num_threads: -1,
135            build_options: KIWI_BUILD_DEFAULT,
136            enabled_dialects: KIWI_DIALECT_STANDARD,
137            typo_cost_threshold: 0.0,
138        }
139    }
140}
141
142impl BuilderConfig {
143    /// Sets model path.
144    pub fn with_model_path(mut self, model_path: impl AsRef<Path>) -> Self {
145        self.model_path = Some(model_path.as_ref().to_path_buf());
146        self
147    }
148
149    /// Sets worker thread count.
150    pub fn with_num_threads(mut self, num_threads: i32) -> Self {
151        self.num_threads = num_threads;
152        self
153    }
154
155    /// Sets build option flags.
156    pub fn with_build_options(mut self, build_options: i32) -> Self {
157        self.build_options = build_options;
158        self
159    }
160
161    /// Sets enabled dialect bit mask.
162    pub fn with_enabled_dialects(mut self, enabled_dialects: i32) -> Self {
163        self.enabled_dialects = enabled_dialects;
164        self
165    }
166
167    /// Sets typo cost threshold.
168    pub fn with_typo_cost_threshold(mut self, typo_cost_threshold: f32) -> Self {
169        self.typo_cost_threshold = typo_cost_threshold;
170        self
171    }
172}
173
174/// Top-level configuration used by [`crate::Kiwi::from_config`].
175#[derive(Debug, Clone)]
176pub struct KiwiConfig {
177    /// Dynamic library path. Defaults to `KIWI_LIBRARY_PATH` env var.
178    pub library_path: Option<PathBuf>,
179    /// Builder-related options.
180    pub builder: BuilderConfig,
181    /// Default analysis options applied by convenience APIs.
182    pub default_analyze_options: AnalyzeOptions,
183    /// User dictionary entries inserted during initialization.
184    pub user_words: Vec<UserWord>,
185}
186
187impl Default for KiwiConfig {
188    fn default() -> Self {
189        Self {
190            library_path: env::var_os("KIWI_LIBRARY_PATH").map(PathBuf::from),
191            builder: BuilderConfig::default(),
192            default_analyze_options: AnalyzeOptions::default(),
193            user_words: Vec::new(),
194        }
195    }
196}
197
198impl KiwiConfig {
199    /// Sets dynamic library path.
200    pub fn with_library_path(mut self, library_path: impl AsRef<Path>) -> Self {
201        self.library_path = Some(library_path.as_ref().to_path_buf());
202        self
203    }
204
205    /// Sets model path inside [`Self::builder`].
206    pub fn with_model_path(mut self, model_path: impl AsRef<Path>) -> Self {
207        self.builder = self.builder.with_model_path(model_path);
208        self
209    }
210
211    /// Replaces builder config.
212    pub fn with_builder(mut self, builder: BuilderConfig) -> Self {
213        self.builder = builder;
214        self
215    }
216
217    /// Replaces default analysis options.
218    pub fn with_default_analyze_options(mut self, options: AnalyzeOptions) -> Self {
219        self.default_analyze_options = options;
220        self
221    }
222
223    /// Adds one user dictionary entry.
224    pub fn add_user_word(
225        mut self,
226        word: impl Into<String>,
227        tag: impl Into<String>,
228        score: f32,
229    ) -> Self {
230        self.user_words.push(UserWord::new(word, tag, score));
231        self
232    }
233}
234
/// One morpheme token produced by Kiwi analysis.
#[derive(Clone, Debug)]
pub struct Token {
    /// The surface form.
    pub form: String,
    /// The part-of-speech tag, as a string.
    pub tag: String,
    /// Start offset in the original UTF-8 text, counted in characters
    /// (`str.chars()`).
    pub position: usize,
    /// Length in characters (`str.chars()` count), not in bytes.
    pub length: usize,
    /// Index of the word inside the analyzed sentence.
    pub word_position: usize,
    /// Index of the sentence in multi-sentence analysis output.
    pub sent_position: usize,
    /// Line number metadata taken from Kiwi output.
    pub line_number: usize,
    /// Sub-sentence index metadata taken from Kiwi output.
    pub sub_sent_position: usize,
    /// Score assigned to the token by the language model.
    pub score: f32,
    /// Typo correction cost for this token.
    pub typo_cost: f32,
    /// Typo form identifier from Kiwi internals.
    pub typo_form_id: u32,
    /// Index of the paired token, if any (for paired punctuation etc.).
    pub paired_token: Option<usize>,
    /// Morpheme id, if available, for dictionary-backed APIs.
    pub morpheme_id: Option<u32>,
    /// Numeric tag id, if available.
    pub tag_id: Option<u8>,
    /// Sense id or script id, depending on the tag, if available.
    pub sense_or_script: Option<u8>,
    /// Dialect id, if available.
    pub dialect: Option<u16>,
}
271
272/// One analysis candidate, including probability and token list.
273#[derive(Debug, Clone)]
274pub struct AnalysisCandidate {
275    /// Candidate probability score.
276    pub probability: f32,
277    /// Token sequence for this candidate.
278    pub tokens: Vec<Token>,
279}
280
281/// Alias kept for readability in user code.
282pub type Analysis = AnalysisCandidate;
283
284/// Sentence split result used by `split_into_sents*_with_options`.
285#[derive(Debug, Clone)]
286pub struct Sentence {
287    /// Raw sentence text slice (owned).
288    pub text: String,
289    /// Character-based start offset (`str.chars()` index).
290    pub start: usize,
291    /// Character-based end offset (`str.chars()` index).
292    pub end: usize,
293    /// Tokens in this sentence when requested.
294    pub tokens: Option<Vec<Token>>,
295    /// Nested sub-sentences when requested.
296    pub subs: Option<Vec<Sentence>>,
297}