Skip to main content

kiwi_rs/
model.rs

1use crate::native::{KiwiGlobalConfigRaw, KiwiMorphemeRaw, KiwiTokenInfoRaw};
2
/// Pre-analyzed token element passed to
/// [`crate::KiwiBuilder::add_pre_analyzed_word`].
///
/// `begin`/`end` are character offsets in the given surface form
/// (Rust `str.chars()` index space, not byte offsets).
///
/// Two tokens compare equal when `form`, `tag`, `begin` and `end` are all
/// equal, which also makes the type usable as a hash-map/set key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PreAnalyzedToken {
    /// Surface form.
    pub form: String,
    /// Part-of-speech tag.
    pub tag: String,
    /// Optional begin character offset.
    pub begin: Option<usize>,
    /// Optional end character offset.
    pub end: Option<usize>,
}

impl PreAnalyzedToken {
    /// Creates a token with only `form` and `tag`.
    ///
    /// Both span offsets (`begin`, `end`) start out as `None`; attach them
    /// with [`PreAnalyzedToken::with_span`] when needed.
    #[must_use]
    pub fn new(form: impl Into<String>, tag: impl Into<String>) -> Self {
        Self {
            form: form.into(),
            tag: tag.into(),
            begin: None,
            end: None,
        }
    }

    /// Sets explicit span offsets and returns the updated token.
    ///
    /// This consumes `self`, so the returned value must be used; dropping it
    /// discards the span. The offsets are stored as given: this method does
    /// not check that `begin <= end` or that either lies within `form`.
    #[must_use]
    pub fn with_span(mut self, begin: usize, end: usize) -> Self {
        self.begin = Some(begin);
        self.end = Some(end);
        self
    }
}
38
/// Begin/end boundary for a sentence in character offsets.
///
/// Offsets are based on Rust `str.chars()` indexing. The range is half-open:
/// `begin` is included, `end` is not.
///
/// Boundaries compare equal when both offsets match, and can be used as
/// hash-map/set keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SentenceBoundary {
    /// Inclusive begin offset.
    pub begin: usize,
    /// Exclusive end offset.
    pub end: usize,
}
49
/// `(id, score)` pair returned by similarity and prediction APIs.
///
/// Equality is field-wise. Because `score` is an `f32`, a pair whose score is
/// NaN is never equal to anything, including itself.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct SimilarityPair {
    /// Identifier of a morpheme or context.
    pub id: u32,
    /// Similarity or prediction score.
    pub score: f32,
}
58
/// Low-level token metadata returned by Kiwi C API.
///
/// Position-like fields (`chr_position`, `word_position`, `sent_position`) use
/// Kiwi's character/token indexing semantics.
///
/// Equality is field-wise; the `f32` fields (`score`, `typo_cost`) follow
/// IEEE-754 comparison, so a NaN value never compares equal.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TokenInfo {
    /// Character position.
    pub chr_position: u32,
    /// Word position.
    pub word_position: u32,
    /// Sentence position.
    pub sent_position: u32,
    /// Line number.
    pub line_number: u32,
    /// Token length.
    pub length: u16,
    /// Numeric tag id.
    pub tag: u8,
    /// Sense id or script id.
    pub sense_or_script: u8,
    /// Token score.
    pub score: f32,
    /// Typo cost.
    pub typo_cost: f32,
    /// Typo form id.
    pub typo_form_id: u32,
    /// Paired token id.
    pub paired_token: u32,
    /// Sub-sentence position.
    pub sub_sent_position: u32,
    /// Dialect id.
    pub dialect: u16,
}
92
93impl From<KiwiTokenInfoRaw> for TokenInfo {
94    fn from(value: KiwiTokenInfoRaw) -> Self {
95        Self {
96            chr_position: value.chr_position,
97            word_position: value.word_position,
98            sent_position: value.sent_position,
99            line_number: value.line_number,
100            length: value.length,
101            tag: value.tag,
102            sense_or_script: value.sense_or_script,
103            score: value.score,
104            typo_cost: value.typo_cost,
105            typo_form_id: value.typo_form_id,
106            paired_token: value.paired_token,
107            sub_sent_position: value.sub_sent_position,
108            dialect: value.dialect,
109        }
110    }
111}
112
/// Candidate extracted word from `extract_words*` builder APIs.
///
/// Equality is field-wise; the `f32` scores follow IEEE-754 comparison, so a
/// NaN score never compares equal.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedWord {
    /// Surface form.
    pub form: String,
    /// Extraction score.
    pub score: f32,
    /// Observed frequency.
    pub frequency: i32,
    /// POS-specific score from Kiwi.
    pub pos_score: f32,
}
125
/// Morpheme metadata from dictionary lookup APIs.
///
/// Equality is field-wise; `user_score` is an `f32`, so a NaN score never
/// compares equal.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct MorphemeInfo {
    /// Numeric tag id.
    pub tag: u8,
    /// Sense id.
    pub sense_id: u8,
    /// User dictionary score.
    pub user_score: f32,
    /// Language-model morpheme id.
    pub lm_morpheme_id: u32,
    /// Original morpheme id.
    pub orig_morpheme_id: u32,
    /// Dialect id.
    pub dialect: u16,
}
142
143impl From<KiwiMorphemeRaw> for MorphemeInfo {
144    fn from(value: KiwiMorphemeRaw) -> Self {
145        Self {
146            tag: value.tag,
147            sense_id: value.sense_id,
148            user_score: value.user_score,
149            lm_morpheme_id: value.lm_morpheme_id,
150            orig_morpheme_id: value.orig_morpheme_id,
151            dialect: value.dialect,
152        }
153    }
154}
155
/// Morpheme information with resolved string fields.
///
/// Two values compare equal when every field matches, and the type can be
/// used as a hash-map/set key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct MorphemeSense {
    /// Morpheme id.
    pub morph_id: u32,
    /// Morpheme form.
    pub form: String,
    /// Morpheme tag.
    pub tag: String,
    /// Sense id.
    pub sense_id: u8,
    /// Dialect id.
    pub dialect: u16,
}
170
/// Global runtime parameters for Kiwi inference behavior.
///
/// Equality is field-wise, so two configurations can be compared directly
/// (e.g. to detect whether a setting changed). The `f32` fields follow
/// IEEE-754 comparison: a NaN value never compares equal.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct GlobalConfig {
    /// Whether to integrate allomorph variants.
    pub integrate_allomorph: bool,
    /// Candidate cut-off threshold.
    pub cut_off_threshold: f32,
    /// Scale applied to unknown-form score.
    pub unk_form_score_scale: f32,
    /// Bias applied to unknown-form score.
    pub unk_form_score_bias: f32,
    /// Penalty for spacing decisions.
    pub space_penalty: f32,
    /// Weight applied to typo costs.
    pub typo_cost_weight: f32,
    /// Maximum unknown token length.
    pub max_unk_form_size: u32,
    /// Allowed whitespace tolerance during analysis.
    pub space_tolerance: u32,
}
191
192impl Default for GlobalConfig {
193    fn default() -> Self {
194        KiwiGlobalConfigRaw::default().into()
195    }
196}
197
198impl From<KiwiGlobalConfigRaw> for GlobalConfig {
199    fn from(value: KiwiGlobalConfigRaw) -> Self {
200        Self {
201            integrate_allomorph: value.integrate_allomorph != 0,
202            cut_off_threshold: value.cut_off_threshold,
203            unk_form_score_scale: value.unk_form_score_scale,
204            unk_form_score_bias: value.unk_form_score_bias,
205            space_penalty: value.space_penalty,
206            typo_cost_weight: value.typo_cost_weight,
207            max_unk_form_size: value.max_unk_form_size,
208            space_tolerance: value.space_tolerance,
209        }
210    }
211}
212
213impl From<GlobalConfig> for KiwiGlobalConfigRaw {
214    fn from(value: GlobalConfig) -> Self {
215        Self {
216            integrate_allomorph: if value.integrate_allomorph { 1 } else { 0 },
217            cut_off_threshold: value.cut_off_threshold,
218            unk_form_score_scale: value.unk_form_score_scale,
219            unk_form_score_bias: value.unk_form_score_bias,
220            space_penalty: value.space_penalty,
221            typo_cost_weight: value.typo_cost_weight,
222            max_unk_form_size: value.max_unk_form_size,
223            space_tolerance: value.space_tolerance,
224        }
225    }
226}