// lindera_nodejs/tokenizer.rs
//! Tokenizer implementation for morphological analysis.
//!
//! This module provides a builder pattern for creating tokenizers and the tokenizer itself.

use std::path::Path;
use std::str::FromStr;

use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::{Tokenizer, TokenizerBuilder};

use crate::dictionary::{JsDictionary, JsUserDictionary};
use crate::error::to_napi_error;
use crate::token::{JsNbestResult, JsToken};
use crate::util::js_value_to_serde_value;
16
/// Builder for creating a Tokenizer with custom configuration.
///
/// The builder pattern allows for fluent configuration of tokenizer parameters including
/// dictionaries, modes, and filter pipelines.
///
/// Exported to JavaScript as `TokenizerBuilder` via the `js_name` attribute below.
#[napi(js_name = "TokenizerBuilder")]
pub struct JsTokenizerBuilder {
    // Underlying lindera builder that accumulates configuration until `build()` is called.
    inner: TokenizerBuilder,
}
25
26#[napi]
27impl JsTokenizerBuilder {
28    /// Creates a new TokenizerBuilder with default configuration.
29    #[napi(constructor)]
30    pub fn new() -> napi::Result<Self> {
31        let inner = TokenizerBuilder::new()
32            .map_err(|err| to_napi_error(format!("Failed to create TokenizerBuilder: {err}")))?;
33
34        Ok(Self { inner })
35    }
36
37    /// Loads configuration from a JSON file.
38    ///
39    /// # Arguments
40    ///
41    /// * `file_path` - Path to the configuration file.
42    ///
43    /// # Returns
44    ///
45    /// A new TokenizerBuilder with the loaded configuration.
46    #[napi]
47    pub fn from_file(&self, file_path: String) -> napi::Result<JsTokenizerBuilder> {
48        let inner = TokenizerBuilder::from_file(Path::new(&file_path))
49            .map_err(|err| to_napi_error(format!("Failed to load config from file: {err}")))?;
50
51        Ok(JsTokenizerBuilder { inner })
52    }
53
54    /// Sets the tokenization mode.
55    ///
56    /// # Arguments
57    ///
58    /// * `mode` - Mode string ("normal" or "decompose").
59    #[napi]
60    pub fn set_mode(&mut self, mode: String) -> napi::Result<()> {
61        let m = Mode::from_str(&mode)
62            .map_err(|err| to_napi_error(format!("Failed to create mode: {err}")))?;
63
64        self.inner.set_segmenter_mode(&m);
65        Ok(())
66    }
67
68    /// Sets the dictionary path or URI.
69    ///
70    /// # Arguments
71    ///
72    /// * `path` - Path to the dictionary directory or embedded URI (e.g. "embedded://ipadic").
73    #[napi]
74    pub fn set_dictionary(&mut self, path: String) {
75        self.inner.set_segmenter_dictionary(&path);
76    }
77
78    /// Sets the user dictionary URI.
79    ///
80    /// # Arguments
81    ///
82    /// * `uri` - URI to the user dictionary.
83    #[napi]
84    pub fn set_user_dictionary(&mut self, uri: String) {
85        self.inner.set_segmenter_user_dictionary(&uri);
86    }
87
88    /// Sets whether to keep whitespace in tokenization results.
89    ///
90    /// # Arguments
91    ///
92    /// * `keep_whitespace` - If true, whitespace tokens will be included in results.
93    #[napi]
94    pub fn set_keep_whitespace(&mut self, keep_whitespace: bool) {
95        self.inner.set_segmenter_keep_whitespace(keep_whitespace);
96    }
97
98    /// Appends a character filter to the preprocessing pipeline.
99    ///
100    /// # Arguments
101    ///
102    /// * `kind` - Type of character filter to add (e.g. "unicode_normalize", "mapping").
103    /// * `args` - Optional filter arguments as a JSON-compatible object.
104    #[napi]
105    pub fn append_character_filter(
106        &mut self,
107        kind: String,
108        args: Option<serde_json::Value>,
109    ) -> napi::Result<()> {
110        let filter_args = js_value_to_serde_value(args);
111        self.inner.append_character_filter(&kind, &filter_args);
112        Ok(())
113    }
114
115    /// Appends a token filter to the postprocessing pipeline.
116    ///
117    /// # Arguments
118    ///
119    /// * `kind` - Type of token filter to add (e.g. "lowercase", "japanese_stop_tags").
120    /// * `args` - Optional filter arguments as a JSON-compatible object.
121    #[napi]
122    pub fn append_token_filter(
123        &mut self,
124        kind: String,
125        args: Option<serde_json::Value>,
126    ) -> napi::Result<()> {
127        let filter_args = js_value_to_serde_value(args);
128        self.inner.append_token_filter(&kind, &filter_args);
129        Ok(())
130    }
131
132    /// Builds the tokenizer with the configured settings.
133    ///
134    /// # Returns
135    ///
136    /// A configured Tokenizer instance ready for use.
137    #[napi]
138    pub fn build(&self) -> napi::Result<JsTokenizer> {
139        let tokenizer = self
140            .inner
141            .build()
142            .map_err(|err| to_napi_error(format!("Failed to build tokenizer: {err}")))?;
143
144        Ok(JsTokenizer { inner: tokenizer })
145    }
146}
147
/// Tokenizer for performing morphological analysis.
///
/// The tokenizer processes text and returns tokens with their morphological features.
///
/// Exported to JavaScript as `Tokenizer` via the `js_name` attribute below.
#[napi(js_name = "Tokenizer")]
pub struct JsTokenizer {
    // Underlying lindera tokenizer that performs the actual analysis.
    inner: Tokenizer,
}
155
156#[napi]
157impl JsTokenizer {
158    /// Creates a new tokenizer with the given dictionary and mode.
159    ///
160    /// # Arguments
161    ///
162    /// * `dictionary` - Dictionary to use for tokenization.
163    /// * `mode` - Tokenization mode ("normal" or "decompose"). Default: "normal".
164    /// * `user_dictionary` - Optional user dictionary for custom words.
165    #[napi(constructor)]
166    pub fn new(
167        dictionary: &JsDictionary,
168        mode: Option<String>,
169        user_dictionary: Option<&JsUserDictionary>,
170    ) -> napi::Result<Self> {
171        let mode_str = mode.unwrap_or_else(|| "normal".to_string());
172        let m = Mode::from_str(&mode_str)
173            .map_err(|err| to_napi_error(format!("Failed to create mode: {err}")))?;
174
175        let dict = dictionary.inner.clone();
176        let user_dict = user_dictionary.map(|d| d.inner.clone());
177
178        let segmenter = Segmenter::new(m, dict, user_dict);
179        let tokenizer = Tokenizer::new(segmenter);
180
181        Ok(Self { inner: tokenizer })
182    }
183
184    /// Tokenizes the given text.
185    ///
186    /// # Arguments
187    ///
188    /// * `text` - Text to tokenize.
189    ///
190    /// # Returns
191    ///
192    /// An array of Token objects containing morphological features.
193    #[napi]
194    pub fn tokenize(&self, text: String) -> napi::Result<Vec<JsToken>> {
195        let tokens = self
196            .inner
197            .tokenize(&text)
198            .map_err(|err| to_napi_error(format!("Failed to tokenize text: {err}")))?;
199
200        let js_tokens: Vec<JsToken> = tokens.into_iter().map(JsToken::from_token).collect();
201
202        Ok(js_tokens)
203    }
204
205    /// Tokenizes the given text and returns N-best results.
206    ///
207    /// # Arguments
208    ///
209    /// * `text` - Text to tokenize.
210    /// * `n` - Number of N-best results to return.
211    /// * `unique` - If true, deduplicate results (default: false).
212    /// * `cost_threshold` - Maximum cost difference from the best path (default: undefined).
213    ///
214    /// # Returns
215    ///
216    /// An array of NbestResult objects, each containing tokens and their cost.
217    #[napi]
218    pub fn tokenize_nbest(
219        &self,
220        text: String,
221        n: u32,
222        unique: Option<bool>,
223        cost_threshold: Option<i64>,
224    ) -> napi::Result<Vec<JsNbestResult>> {
225        let results = self
226            .inner
227            .tokenize_nbest(&text, n as usize, unique.unwrap_or(false), cost_threshold)
228            .map_err(|err| to_napi_error(format!("Failed to tokenize_nbest text: {err}")))?;
229
230        let js_results: Vec<JsNbestResult> = results
231            .into_iter()
232            .map(|(tokens, cost)| {
233                JsNbestResult::new(tokens.into_iter().map(JsToken::from_token).collect(), cost)
234            })
235            .collect();
236
237        Ok(js_results)
238    }
239}