1use std::cell::RefCell;
6use std::path::Path;
7use std::str::FromStr;
8
9use magnus::prelude::*;
10use magnus::{Error, RArray, RHash, Ruby, function, method};
11
12use lindera::mode::Mode;
13use lindera::segmenter::Segmenter;
14use lindera::tokenizer::{Tokenizer, TokenizerBuilder};
15
16use crate::dictionary::{RbDictionary, RbUserDictionary};
17use crate::error::to_magnus_error;
18use crate::token::RbToken;
19use crate::util::rb_hash_to_json;
20
21#[magnus::wrap(class = "Lindera::TokenizerBuilder", free_immediately, size)]
26pub struct RbTokenizerBuilder {
27 inner: RefCell<TokenizerBuilder>,
29}
30
31impl RbTokenizerBuilder {
32 fn new() -> Result<Self, Error> {
38 let ruby = Ruby::get().expect("Ruby runtime not initialized");
39 let inner = TokenizerBuilder::new().map_err(|err| {
40 to_magnus_error(&ruby, format!("Failed to create TokenizerBuilder: {err}"))
41 })?;
42 Ok(Self {
43 inner: RefCell::new(inner),
44 })
45 }
46
47 fn from_file(file_path: String) -> Result<Self, Error> {
57 let ruby = Ruby::get().expect("Ruby runtime not initialized");
58 let inner = TokenizerBuilder::from_file(Path::new(&file_path)).map_err(|err| {
59 to_magnus_error(&ruby, format!("Failed to load config from file: {err}"))
60 })?;
61 Ok(Self {
62 inner: RefCell::new(inner),
63 })
64 }
65
66 fn set_mode(&self, mode: String) -> Result<(), Error> {
72 let ruby = Ruby::get().expect("Ruby runtime not initialized");
73 let m = Mode::from_str(&mode)
74 .map_err(|err| to_magnus_error(&ruby, format!("Failed to create mode: {err}")))?;
75 self.inner.borrow_mut().set_segmenter_mode(&m);
76 Ok(())
77 }
78
79 fn set_dictionary(&self, path: String) {
85 self.inner.borrow_mut().set_segmenter_dictionary(&path);
86 }
87
88 fn set_user_dictionary(&self, uri: String) {
94 self.inner.borrow_mut().set_segmenter_user_dictionary(&uri);
95 }
96
97 fn set_keep_whitespace(&self, keep_whitespace: bool) {
103 self.inner
104 .borrow_mut()
105 .set_segmenter_keep_whitespace(keep_whitespace);
106 }
107
108 fn append_character_filter(&self, kind: String, args: Option<RHash>) -> Result<(), Error> {
115 let ruby = Ruby::get().expect("Ruby runtime not initialized");
116 let filter_args = if let Some(hash) = args {
117 rb_hash_to_json(&ruby, hash)?
118 } else {
119 serde_json::Value::Object(serde_json::Map::new())
120 };
121 self.inner
122 .borrow_mut()
123 .append_character_filter(&kind, &filter_args);
124 Ok(())
125 }
126
127 fn append_token_filter(&self, kind: String, args: Option<RHash>) -> Result<(), Error> {
134 let ruby = Ruby::get().expect("Ruby runtime not initialized");
135 let filter_args = if let Some(hash) = args {
136 rb_hash_to_json(&ruby, hash)?
137 } else {
138 serde_json::Value::Object(serde_json::Map::new())
139 };
140 self.inner
141 .borrow_mut()
142 .append_token_filter(&kind, &filter_args);
143 Ok(())
144 }
145
146 fn build(&self) -> Result<RbTokenizer, Error> {
152 let ruby = Ruby::get().expect("Ruby runtime not initialized");
153 let tokenizer =
154 self.inner.borrow().build().map_err(|err| {
155 to_magnus_error(&ruby, format!("Failed to build tokenizer: {err}"))
156 })?;
157 Ok(RbTokenizer { inner: tokenizer })
158 }
159}
160
161#[magnus::wrap(class = "Lindera::Tokenizer", free_immediately, size)]
165pub struct RbTokenizer {
166 inner: Tokenizer,
168}
169
170fn tokenizer_new(
182 dictionary: &RbDictionary,
183 mode: Option<String>,
184 user_dictionary: Option<&RbUserDictionary>,
185) -> Result<RbTokenizer, Error> {
186 let ruby = Ruby::get().expect("Ruby runtime not initialized");
187 let mode_str = mode.as_deref().unwrap_or("normal");
188 let m = Mode::from_str(mode_str)
189 .map_err(|err| to_magnus_error(&ruby, format!("Failed to create mode: {err}")))?;
190
191 let dict = dictionary.inner.clone();
192 let user_dict = user_dictionary.map(|d| d.inner.clone());
193
194 let segmenter = Segmenter::new(m, dict, user_dict);
195 let tokenizer = Tokenizer::new(segmenter);
196
197 Ok(RbTokenizer { inner: tokenizer })
198}
199
200impl RbTokenizer {
201 fn tokenize(&self, text: String) -> Result<RArray, Error> {
211 let ruby = Ruby::get().expect("Ruby runtime not initialized");
212 let tokens = self
213 .inner
214 .tokenize(&text)
215 .map_err(|err| to_magnus_error(&ruby, format!("Failed to tokenize text: {err}")))?;
216
217 let rb_tokens: Vec<RbToken> = tokens.into_iter().map(RbToken::from_token).collect();
218 let arr = ruby.ary_new_capa(rb_tokens.len());
219 for token in rb_tokens {
220 arr.push(ruby.into_value(token))?;
221 }
222 Ok(arr)
223 }
224
225 fn tokenize_nbest(
238 &self,
239 text: String,
240 n: usize,
241 unique: Option<bool>,
242 cost_threshold: Option<i64>,
243 ) -> Result<RArray, Error> {
244 let ruby = Ruby::get().expect("Ruby runtime not initialized");
245 let results = self
246 .inner
247 .tokenize_nbest(&text, n, unique.unwrap_or(false), cost_threshold)
248 .map_err(|err| {
249 to_magnus_error(&ruby, format!("Failed to tokenize_nbest text: {err}"))
250 })?;
251
252 let rb_results = ruby.ary_new_capa(results.len());
253 for (tokens, cost) in results {
254 let rb_tokens: Vec<RbToken> = tokens.into_iter().map(RbToken::from_token).collect();
255 let token_arr = ruby.ary_new_capa(rb_tokens.len());
256 for token in rb_tokens {
257 token_arr.push(ruby.into_value(token))?;
258 }
259 let pair = ruby.ary_new_capa(2);
260 pair.push(token_arr)?;
261 pair.push(cost)?;
262 rb_results.push(pair)?;
263 }
264
265 Ok(rb_results)
266 }
267}
268
269pub fn define(ruby: &Ruby, module: &magnus::RModule) -> Result<(), Error> {
280 let builder_class = module.define_class("TokenizerBuilder", ruby.class_object())?;
281 builder_class.define_singleton_method("new", function!(RbTokenizerBuilder::new, 0))?;
282 builder_class
283 .define_singleton_method("from_file", function!(RbTokenizerBuilder::from_file, 1))?;
284 builder_class.define_method("set_mode", method!(RbTokenizerBuilder::set_mode, 1))?;
285 builder_class.define_method(
286 "set_dictionary",
287 method!(RbTokenizerBuilder::set_dictionary, 1),
288 )?;
289 builder_class.define_method(
290 "set_user_dictionary",
291 method!(RbTokenizerBuilder::set_user_dictionary, 1),
292 )?;
293 builder_class.define_method(
294 "set_keep_whitespace",
295 method!(RbTokenizerBuilder::set_keep_whitespace, 1),
296 )?;
297 builder_class.define_method(
298 "append_character_filter",
299 method!(RbTokenizerBuilder::append_character_filter, 2),
300 )?;
301 builder_class.define_method(
302 "append_token_filter",
303 method!(RbTokenizerBuilder::append_token_filter, 2),
304 )?;
305 builder_class.define_method("build", method!(RbTokenizerBuilder::build, 0))?;
306
307 let tokenizer_class = module.define_class("Tokenizer", ruby.class_object())?;
308 tokenizer_class.define_singleton_method("new", function!(tokenizer_new, 3))?;
309 tokenizer_class.define_method("tokenize", method!(RbTokenizer::tokenize, 1))?;
310 tokenizer_class.define_method("tokenize_nbest", method!(RbTokenizer::tokenize_nbest, 4))?;
311
312 Ok(())
313}