lindera_nodejs/
tokenizer.rs1use std::path::Path;
6use std::str::FromStr;
7
8use lindera::mode::Mode;
9use lindera::segmenter::Segmenter;
10use lindera::tokenizer::{Tokenizer, TokenizerBuilder};
11
12use crate::dictionary::{JsDictionary, JsUserDictionary};
13use crate::error::to_napi_error;
14use crate::token::{JsNbestResult, JsToken};
15use crate::util::js_value_to_serde_value;
16
17#[napi(js_name = "TokenizerBuilder")]
22pub struct JsTokenizerBuilder {
23 inner: TokenizerBuilder,
24}
25
26#[napi]
27impl JsTokenizerBuilder {
28 #[napi(constructor)]
30 pub fn new() -> napi::Result<Self> {
31 let inner = TokenizerBuilder::new()
32 .map_err(|err| to_napi_error(format!("Failed to create TokenizerBuilder: {err}")))?;
33
34 Ok(Self { inner })
35 }
36
37 #[napi]
47 pub fn from_file(&self, file_path: String) -> napi::Result<JsTokenizerBuilder> {
48 let inner = TokenizerBuilder::from_file(Path::new(&file_path))
49 .map_err(|err| to_napi_error(format!("Failed to load config from file: {err}")))?;
50
51 Ok(JsTokenizerBuilder { inner })
52 }
53
54 #[napi]
60 pub fn set_mode(&mut self, mode: String) -> napi::Result<()> {
61 let m = Mode::from_str(&mode)
62 .map_err(|err| to_napi_error(format!("Failed to create mode: {err}")))?;
63
64 self.inner.set_segmenter_mode(&m);
65 Ok(())
66 }
67
68 #[napi]
74 pub fn set_dictionary(&mut self, path: String) {
75 self.inner.set_segmenter_dictionary(&path);
76 }
77
78 #[napi]
84 pub fn set_user_dictionary(&mut self, uri: String) {
85 self.inner.set_segmenter_user_dictionary(&uri);
86 }
87
88 #[napi]
94 pub fn set_keep_whitespace(&mut self, keep_whitespace: bool) {
95 self.inner.set_segmenter_keep_whitespace(keep_whitespace);
96 }
97
98 #[napi]
105 pub fn append_character_filter(
106 &mut self,
107 kind: String,
108 args: Option<serde_json::Value>,
109 ) -> napi::Result<()> {
110 let filter_args = js_value_to_serde_value(args);
111 self.inner.append_character_filter(&kind, &filter_args);
112 Ok(())
113 }
114
115 #[napi]
122 pub fn append_token_filter(
123 &mut self,
124 kind: String,
125 args: Option<serde_json::Value>,
126 ) -> napi::Result<()> {
127 let filter_args = js_value_to_serde_value(args);
128 self.inner.append_token_filter(&kind, &filter_args);
129 Ok(())
130 }
131
132 #[napi]
138 pub fn build(&self) -> napi::Result<JsTokenizer> {
139 let tokenizer = self
140 .inner
141 .build()
142 .map_err(|err| to_napi_error(format!("Failed to build tokenizer: {err}")))?;
143
144 Ok(JsTokenizer { inner: tokenizer })
145 }
146}
147
148#[napi(js_name = "Tokenizer")]
152pub struct JsTokenizer {
153 inner: Tokenizer,
154}
155
156#[napi]
157impl JsTokenizer {
158 #[napi(constructor)]
166 pub fn new(
167 dictionary: &JsDictionary,
168 mode: Option<String>,
169 user_dictionary: Option<&JsUserDictionary>,
170 ) -> napi::Result<Self> {
171 let mode_str = mode.unwrap_or_else(|| "normal".to_string());
172 let m = Mode::from_str(&mode_str)
173 .map_err(|err| to_napi_error(format!("Failed to create mode: {err}")))?;
174
175 let dict = dictionary.inner.clone();
176 let user_dict = user_dictionary.map(|d| d.inner.clone());
177
178 let segmenter = Segmenter::new(m, dict, user_dict);
179 let tokenizer = Tokenizer::new(segmenter);
180
181 Ok(Self { inner: tokenizer })
182 }
183
184 #[napi]
194 pub fn tokenize(&self, text: String) -> napi::Result<Vec<JsToken>> {
195 let tokens = self
196 .inner
197 .tokenize(&text)
198 .map_err(|err| to_napi_error(format!("Failed to tokenize text: {err}")))?;
199
200 let js_tokens: Vec<JsToken> = tokens.into_iter().map(JsToken::from_token).collect();
201
202 Ok(js_tokens)
203 }
204
205 #[napi]
218 pub fn tokenize_nbest(
219 &self,
220 text: String,
221 n: u32,
222 unique: Option<bool>,
223 cost_threshold: Option<i64>,
224 ) -> napi::Result<Vec<JsNbestResult>> {
225 let results = self
226 .inner
227 .tokenize_nbest(&text, n as usize, unique.unwrap_or(false), cost_threshold)
228 .map_err(|err| to_napi_error(format!("Failed to tokenize_nbest text: {err}")))?;
229
230 let js_results: Vec<JsNbestResult> = results
231 .into_iter()
232 .map(|(tokens, cost)| {
233 JsNbestResult::new(tokens.into_iter().map(JsToken::from_token).collect(), cost)
234 })
235 .collect();
236
237 Ok(js_results)
238 }
239}