// lindera_ruby / tokenizer.rs

1//! Tokenizer implementation for morphological analysis.
2//!
3//! This module provides a builder pattern for creating tokenizers and the tokenizer itself.
4
5use std::cell::RefCell;
6use std::path::Path;
7use std::str::FromStr;
8
9use magnus::prelude::*;
10use magnus::{Error, RArray, RHash, Ruby, function, method};
11
12use lindera::mode::Mode;
13use lindera::segmenter::Segmenter;
14use lindera::tokenizer::{Tokenizer, TokenizerBuilder};
15
16use crate::dictionary::{RbDictionary, RbUserDictionary};
17use crate::error::to_magnus_error;
18use crate::token::RbToken;
19use crate::util::rb_hash_to_json;
20
/// Builder for creating a `Tokenizer` with custom configuration.
///
/// The builder pattern allows for fluent configuration of tokenizer parameters
/// (mode, dictionaries, whitespace handling, filter pipeline) before `build` is
/// called. Uses `RefCell` for interior mutability since Magnus `method!`
/// exposes methods through `&self`.
#[magnus::wrap(class = "Lindera::TokenizerBuilder", free_immediately, size)]
pub struct RbTokenizerBuilder {
    /// Inner Lindera tokenizer builder (wrapped in `RefCell` so setter methods
    /// taking `&self` can still mutate the configuration).
    inner: RefCell<TokenizerBuilder>,
}
30
31impl RbTokenizerBuilder {
32    /// Creates a new `RbTokenizerBuilder` with default configuration.
33    ///
34    /// # Returns
35    ///
36    /// A new `RbTokenizerBuilder` instance.
37    fn new() -> Result<Self, Error> {
38        let ruby = Ruby::get().expect("Ruby runtime not initialized");
39        let inner = TokenizerBuilder::new().map_err(|err| {
40            to_magnus_error(&ruby, format!("Failed to create TokenizerBuilder: {err}"))
41        })?;
42        Ok(Self {
43            inner: RefCell::new(inner),
44        })
45    }
46
47    /// Loads configuration from a file.
48    ///
49    /// # Arguments
50    ///
51    /// * `file_path` - Path to the configuration file.
52    ///
53    /// # Returns
54    ///
55    /// A new `RbTokenizerBuilder` with the loaded configuration.
56    fn from_file(file_path: String) -> Result<Self, Error> {
57        let ruby = Ruby::get().expect("Ruby runtime not initialized");
58        let inner = TokenizerBuilder::from_file(Path::new(&file_path)).map_err(|err| {
59            to_magnus_error(&ruby, format!("Failed to load config from file: {err}"))
60        })?;
61        Ok(Self {
62            inner: RefCell::new(inner),
63        })
64    }
65
66    /// Sets the tokenization mode.
67    ///
68    /// # Arguments
69    ///
70    /// * `mode` - Mode string ("normal" or "decompose").
71    fn set_mode(&self, mode: String) -> Result<(), Error> {
72        let ruby = Ruby::get().expect("Ruby runtime not initialized");
73        let m = Mode::from_str(&mode)
74            .map_err(|err| to_magnus_error(&ruby, format!("Failed to create mode: {err}")))?;
75        self.inner.borrow_mut().set_segmenter_mode(&m);
76        Ok(())
77    }
78
79    /// Sets the dictionary path.
80    ///
81    /// # Arguments
82    ///
83    /// * `path` - Path to the dictionary directory.
84    fn set_dictionary(&self, path: String) {
85        self.inner.borrow_mut().set_segmenter_dictionary(&path);
86    }
87
88    /// Sets the user dictionary URI.
89    ///
90    /// # Arguments
91    ///
92    /// * `uri` - URI to the user dictionary.
93    fn set_user_dictionary(&self, uri: String) {
94        self.inner.borrow_mut().set_segmenter_user_dictionary(&uri);
95    }
96
97    /// Sets whether to keep whitespace in tokenization results.
98    ///
99    /// # Arguments
100    ///
101    /// * `keep_whitespace` - If true, whitespace tokens will be included.
102    fn set_keep_whitespace(&self, keep_whitespace: bool) {
103        self.inner
104            .borrow_mut()
105            .set_segmenter_keep_whitespace(keep_whitespace);
106    }
107
108    /// Appends a character filter to the filter pipeline.
109    ///
110    /// # Arguments
111    ///
112    /// * `kind` - Type of character filter to add.
113    /// * `args` - Optional hash of filter arguments.
114    fn append_character_filter(&self, kind: String, args: Option<RHash>) -> Result<(), Error> {
115        let ruby = Ruby::get().expect("Ruby runtime not initialized");
116        let filter_args = if let Some(hash) = args {
117            rb_hash_to_json(&ruby, hash)?
118        } else {
119            serde_json::Value::Object(serde_json::Map::new())
120        };
121        self.inner
122            .borrow_mut()
123            .append_character_filter(&kind, &filter_args);
124        Ok(())
125    }
126
127    /// Appends a token filter to the filter pipeline.
128    ///
129    /// # Arguments
130    ///
131    /// * `kind` - Type of token filter to add.
132    /// * `args` - Optional hash of filter arguments.
133    fn append_token_filter(&self, kind: String, args: Option<RHash>) -> Result<(), Error> {
134        let ruby = Ruby::get().expect("Ruby runtime not initialized");
135        let filter_args = if let Some(hash) = args {
136            rb_hash_to_json(&ruby, hash)?
137        } else {
138            serde_json::Value::Object(serde_json::Map::new())
139        };
140        self.inner
141            .borrow_mut()
142            .append_token_filter(&kind, &filter_args);
143        Ok(())
144    }
145
146    /// Builds the tokenizer with the configured settings.
147    ///
148    /// # Returns
149    ///
150    /// A configured `RbTokenizer` instance.
151    fn build(&self) -> Result<RbTokenizer, Error> {
152        let ruby = Ruby::get().expect("Ruby runtime not initialized");
153        let tokenizer =
154            self.inner.borrow().build().map_err(|err| {
155                to_magnus_error(&ruby, format!("Failed to build tokenizer: {err}"))
156            })?;
157        Ok(RbTokenizer { inner: tokenizer })
158    }
159}
160
/// Tokenizer for performing morphological analysis.
///
/// The tokenizer processes text and returns tokens with their morphological
/// features. Instances are created either via `Lindera::Tokenizer.new` or by
/// `Lindera::TokenizerBuilder#build`.
#[magnus::wrap(class = "Lindera::Tokenizer", free_immediately, size)]
pub struct RbTokenizer {
    /// Inner Lindera tokenizer that performs the actual segmentation.
    inner: Tokenizer,
}
169
170/// Creates a new tokenizer with the given dictionary and mode.
171///
172/// # Arguments
173///
174/// * `dictionary` - Dictionary to use.
175/// * `mode` - Tokenization mode ("normal" or "decompose"). Defaults to "normal".
176/// * `user_dictionary` - Optional user dictionary.
177///
178/// # Returns
179///
180/// A new `RbTokenizer` instance.
181fn tokenizer_new(
182    dictionary: &RbDictionary,
183    mode: Option<String>,
184    user_dictionary: Option<&RbUserDictionary>,
185) -> Result<RbTokenizer, Error> {
186    let ruby = Ruby::get().expect("Ruby runtime not initialized");
187    let mode_str = mode.as_deref().unwrap_or("normal");
188    let m = Mode::from_str(mode_str)
189        .map_err(|err| to_magnus_error(&ruby, format!("Failed to create mode: {err}")))?;
190
191    let dict = dictionary.inner.clone();
192    let user_dict = user_dictionary.map(|d| d.inner.clone());
193
194    let segmenter = Segmenter::new(m, dict, user_dict);
195    let tokenizer = Tokenizer::new(segmenter);
196
197    Ok(RbTokenizer { inner: tokenizer })
198}
199
200impl RbTokenizer {
201    /// Tokenizes the given text.
202    ///
203    /// # Arguments
204    ///
205    /// * `text` - Text to tokenize.
206    ///
207    /// # Returns
208    ///
209    /// An array of Token objects.
210    fn tokenize(&self, text: String) -> Result<RArray, Error> {
211        let ruby = Ruby::get().expect("Ruby runtime not initialized");
212        let tokens = self
213            .inner
214            .tokenize(&text)
215            .map_err(|err| to_magnus_error(&ruby, format!("Failed to tokenize text: {err}")))?;
216
217        let rb_tokens: Vec<RbToken> = tokens.into_iter().map(RbToken::from_token).collect();
218        let arr = ruby.ary_new_capa(rb_tokens.len());
219        for token in rb_tokens {
220            arr.push(ruby.into_value(token))?;
221        }
222        Ok(arr)
223    }
224
225    /// Tokenizes the given text and returns N-best results.
226    ///
227    /// # Arguments
228    ///
229    /// * `text` - Text to tokenize.
230    /// * `n` - Number of N-best results.
231    /// * `unique` - Whether to deduplicate results (default: false).
232    /// * `cost_threshold` - Optional cost threshold.
233    ///
234    /// # Returns
235    ///
236    /// An array of [tokens, cost] pairs.
237    fn tokenize_nbest(
238        &self,
239        text: String,
240        n: usize,
241        unique: Option<bool>,
242        cost_threshold: Option<i64>,
243    ) -> Result<RArray, Error> {
244        let ruby = Ruby::get().expect("Ruby runtime not initialized");
245        let results = self
246            .inner
247            .tokenize_nbest(&text, n, unique.unwrap_or(false), cost_threshold)
248            .map_err(|err| {
249                to_magnus_error(&ruby, format!("Failed to tokenize_nbest text: {err}"))
250            })?;
251
252        let rb_results = ruby.ary_new_capa(results.len());
253        for (tokens, cost) in results {
254            let rb_tokens: Vec<RbToken> = tokens.into_iter().map(RbToken::from_token).collect();
255            let token_arr = ruby.ary_new_capa(rb_tokens.len());
256            for token in rb_tokens {
257                token_arr.push(ruby.into_value(token))?;
258            }
259            let pair = ruby.ary_new_capa(2);
260            pair.push(token_arr)?;
261            pair.push(cost)?;
262            rb_results.push(pair)?;
263        }
264
265        Ok(rb_results)
266    }
267}
268
269/// Defines TokenizerBuilder and Tokenizer classes in the given Ruby module.
270///
271/// # Arguments
272///
273/// * `ruby` - Ruby runtime handle.
274/// * `module` - Parent Ruby module.
275///
276/// # Returns
277///
278/// `Ok(())` on success, or a Magnus `Error` on failure.
279pub fn define(ruby: &Ruby, module: &magnus::RModule) -> Result<(), Error> {
280    let builder_class = module.define_class("TokenizerBuilder", ruby.class_object())?;
281    builder_class.define_singleton_method("new", function!(RbTokenizerBuilder::new, 0))?;
282    builder_class
283        .define_singleton_method("from_file", function!(RbTokenizerBuilder::from_file, 1))?;
284    builder_class.define_method("set_mode", method!(RbTokenizerBuilder::set_mode, 1))?;
285    builder_class.define_method(
286        "set_dictionary",
287        method!(RbTokenizerBuilder::set_dictionary, 1),
288    )?;
289    builder_class.define_method(
290        "set_user_dictionary",
291        method!(RbTokenizerBuilder::set_user_dictionary, 1),
292    )?;
293    builder_class.define_method(
294        "set_keep_whitespace",
295        method!(RbTokenizerBuilder::set_keep_whitespace, 1),
296    )?;
297    builder_class.define_method(
298        "append_character_filter",
299        method!(RbTokenizerBuilder::append_character_filter, 2),
300    )?;
301    builder_class.define_method(
302        "append_token_filter",
303        method!(RbTokenizerBuilder::append_token_filter, 2),
304    )?;
305    builder_class.define_method("build", method!(RbTokenizerBuilder::build, 0))?;
306
307    let tokenizer_class = module.define_class("Tokenizer", ruby.class_object())?;
308    tokenizer_class.define_singleton_method("new", function!(tokenizer_new, 3))?;
309    tokenizer_class.define_method("tokenize", method!(RbTokenizer::tokenize, 1))?;
310    tokenizer_class.define_method("tokenize_nbest", method!(RbTokenizer::tokenize_nbest, 4))?;
311
312    Ok(())
313}