use std::str::FromStr;
use serde_json::Value;
use wasm_bindgen::prelude::*;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::{
Tokenizer as LinderaTokenizer, TokenizerBuilder as LinderaTokenizerBuilder,
};
use crate::dictionary::{JsDictionary, JsUserDictionary};
use crate::token::JsToken;
/// Wrapper around lindera's `TokenizerBuilder`, exported through `wasm_bindgen`.
///
/// Configuration methods mutate the wrapped builder in place; `build` turns it
/// into a [`Tokenizer`].
#[wasm_bindgen]
pub struct TokenizerBuilder {
// The wrapped lindera builder that all configuration calls are forwarded to.
inner: LinderaTokenizerBuilder,
}
#[wasm_bindgen]
impl TokenizerBuilder {
    /// Creates a new builder by delegating to `LinderaTokenizerBuilder::new`.
    ///
    /// # Errors
    /// Returns the lindera error, rendered as a string `JsValue`, if the
    /// underlying builder cannot be created.
    #[wasm_bindgen(constructor)]
    pub fn new() -> Result<Self, JsValue> {
        LinderaTokenizerBuilder::new()
            .map(|inner| Self { inner })
            .map_err(|e| JsValue::from_str(&e.to_string()))
    }

    /// Consumes the builder and produces a [`Tokenizer`].
    ///
    /// # Errors
    /// Returns the lindera error, rendered as a string `JsValue`, if the
    /// underlying `build` call fails.
    pub fn build(self) -> Result<Tokenizer, JsValue> {
        self.inner
            .build()
            .map(|inner| Tokenizer { inner })
            .map_err(|e| JsValue::from_str(&e.to_string()))
    }

    /// Sets the segmentation mode from its string name (e.g. `"normal"`).
    ///
    /// # Errors
    /// Returns a string `JsValue` if `mode` cannot be parsed by `Mode::from_str`.
    #[wasm_bindgen(js_name = "setMode")]
    pub fn set_mode(&mut self, mode: &str) -> Result<(), JsValue> {
        let parsed = Mode::from_str(mode).map_err(|e| JsValue::from_str(&e.to_string()))?;
        self.inner.set_segmenter_mode(&parsed);
        Ok(())
    }

    /// Sets the dictionary URI (e.g. `"embedded://ipadic"`) on the wrapped builder.
    ///
    /// This method itself always returns `Ok(())`; the `Result` return type is
    /// part of the exported signature.
    #[wasm_bindgen(js_name = "setDictionary")]
    pub fn set_dictionary(&mut self, uri: &str) -> Result<(), JsValue> {
        self.inner.set_segmenter_dictionary(uri);
        Ok(())
    }

    /// Sets the user dictionary URI on the wrapped builder.
    ///
    /// This method itself always returns `Ok(())`; the `Result` return type is
    /// part of the exported signature.
    #[wasm_bindgen(js_name = "setUserDictionary")]
    pub fn set_user_dictionary(&mut self, uri: &str) -> Result<(), JsValue> {
        self.inner.set_segmenter_user_dictionary(uri);
        Ok(())
    }

    /// Forwards the keep-whitespace flag to the wrapped builder.
    ///
    /// This method itself always returns `Ok(())`; the `Result` return type is
    /// part of the exported signature.
    #[wasm_bindgen(js_name = "setKeepWhitespace")]
    pub fn set_keep_whitespace(&mut self, keep: bool) -> Result<(), JsValue> {
        self.inner.set_segmenter_keep_whitespace(keep);
        Ok(())
    }

    /// Appends a character filter identified by `name`.
    ///
    /// `args` is converted to a JSON value; `undefined` or `null` is treated
    /// as an empty JSON object.
    ///
    /// # Errors
    /// Returns a string `JsValue` if `args` cannot be converted to JSON.
    #[wasm_bindgen(js_name = "appendCharacterFilter")]
    pub fn append_character_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
        let filter_args = parse_filter_args(args)?;
        self.inner.append_character_filter(name, &filter_args);
        Ok(())
    }

    /// Appends a token filter identified by `name`.
    ///
    /// `args` is converted to a JSON value; `undefined` or `null` is treated
    /// as an empty JSON object.
    ///
    /// # Errors
    /// Returns a string `JsValue` if `args` cannot be converted to JSON.
    #[wasm_bindgen(js_name = "appendTokenFilter")]
    pub fn append_token_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
        let filter_args = parse_filter_args(args)?;
        self.inner.append_token_filter(name, &filter_args);
        Ok(())
    }

    /// Snake-case alias of [`TokenizerBuilder::set_mode`], exported as `set_mode`.
    #[wasm_bindgen(js_name = "set_mode")]
    pub fn py_set_mode(&mut self, mode: &str) -> Result<(), JsValue> {
        self.set_mode(mode)
    }

    /// Snake-case alias of [`TokenizerBuilder::set_dictionary`], exported as
    /// `set_dictionary`.
    #[wasm_bindgen(js_name = "set_dictionary")]
    pub fn py_set_dictionary(&mut self, uri: &str) -> Result<(), JsValue> {
        self.set_dictionary(uri)
    }

    /// Snake-case alias of [`TokenizerBuilder::set_user_dictionary`], exported
    /// as `set_user_dictionary`.
    #[wasm_bindgen(js_name = "set_user_dictionary")]
    pub fn py_set_user_dictionary(&mut self, uri: &str) -> Result<(), JsValue> {
        self.set_user_dictionary(uri)
    }

    /// Snake-case alias of [`TokenizerBuilder::set_keep_whitespace`], exported
    /// as `set_keep_whitespace`.
    #[wasm_bindgen(js_name = "set_keep_whitespace")]
    pub fn py_set_keep_whitespace(&mut self, keep: bool) -> Result<(), JsValue> {
        self.set_keep_whitespace(keep)
    }

    /// Snake-case alias of [`TokenizerBuilder::append_character_filter`],
    /// exported as `append_character_filter`.
    #[wasm_bindgen(js_name = "append_character_filter")]
    pub fn py_append_character_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
        self.append_character_filter(name, args)
    }

    /// Snake-case alias of [`TokenizerBuilder::append_token_filter`], exported
    /// as `append_token_filter`.
    #[wasm_bindgen(js_name = "append_token_filter")]
    pub fn py_append_token_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
        self.append_token_filter(name, args)
    }
}

/// Converts the filter arguments received from JavaScript into a JSON value.
///
/// `undefined` and `null` map to an empty JSON object; anything else is
/// deserialized with `serde_wasm_bindgen`, and a failure is reported as a
/// string `JsValue`. Kept as a private free function (outside the
/// `#[wasm_bindgen]` impl) so it is shared by both filter methods without
/// becoming part of the exported API.
fn parse_filter_args(args: JsValue) -> Result<Value, JsValue> {
    if args.is_undefined() || args.is_null() {
        return Ok(Value::Object(serde_json::Map::new()));
    }
    serde_wasm_bindgen::from_value::<Value>(args).map_err(|e| JsValue::from_str(&e.to_string()))
}
/// Wrapper around lindera's `Tokenizer`, exported through `wasm_bindgen`.
///
/// Instances come either from `TokenizerBuilder::build` or from the exported
/// constructor, which takes an already-loaded dictionary.
#[wasm_bindgen]
pub struct Tokenizer {
// The wrapped lindera tokenizer that performs the actual tokenization.
inner: LinderaTokenizer,
}
#[wasm_bindgen]
impl Tokenizer {
    /// Creates a tokenizer directly from a loaded dictionary.
    ///
    /// * `dictionary` - the dictionary handed to the segmenter.
    /// * `mode` - optional mode name; `Mode::Normal` is used when omitted.
    /// * `user_dictionary` - optional user dictionary handed to the segmenter.
    ///
    /// # Errors
    /// Returns a string `JsValue` if `mode` is given but cannot be parsed by
    /// `Mode::from_str`.
    #[wasm_bindgen(constructor)]
    pub fn new(
        dictionary: JsDictionary,
        mode: Option<String>,
        user_dictionary: Option<JsUserDictionary>,
    ) -> Result<Tokenizer, JsValue> {
        let segmenter_mode = match mode {
            Some(mode_str) => {
                Mode::from_str(&mode_str).map_err(|e| JsValue::from_str(&e.to_string()))?
            }
            None => Mode::Normal,
        };
        let segmenter = Segmenter::new(
            segmenter_mode,
            dictionary.inner,
            user_dictionary.map(|d| d.inner),
        );
        Ok(Tokenizer {
            inner: LinderaTokenizer::new(segmenter),
        })
    }

    /// Tokenizes `input_text` and returns one `JsToken` per token.
    ///
    /// # Errors
    /// Returns the lindera error, rendered as a string `JsValue`, if
    /// tokenization fails.
    pub fn tokenize(&self, input_text: &str) -> Result<Vec<JsToken>, JsValue> {
        let tokens = self
            .inner
            .tokenize(input_text)
            .map_err(|e| JsValue::from_str(&e.to_string()))?;
        Ok(tokens.into_iter().map(JsToken::from_token).collect())
    }

    /// Runs N-best tokenization and returns a JS array of result objects.
    ///
    /// Each element is an object with a `tokens` property (array of the
    /// tokens' `to_json()` values) and a `cost` property (the result's cost
    /// as a JS number). `unique` defaults to `false` when omitted; `n` and
    /// `cost_threshold` are forwarded unchanged to the wrapped tokenizer.
    ///
    /// # Errors
    /// Returns a string `JsValue` if tokenization fails, or the JS exception
    /// raised while setting a property on a result object.
    #[wasm_bindgen(js_name = "tokenizeNbest")]
    pub fn tokenize_nbest(
        &self,
        input_text: &str,
        n: usize,
        unique: Option<bool>,
        cost_threshold: Option<i64>,
    ) -> Result<JsValue, JsValue> {
        let results = self
            .inner
            .tokenize_nbest(input_text, n, unique.unwrap_or(false), cost_threshold)
            .map_err(|e| JsValue::from_str(&e.to_string()))?;

        // Property keys are identical for every entry, so create them once.
        let tokens_key = JsValue::from_str("tokens");
        let cost_key = JsValue::from_str("cost");

        let outer = js_sys::Array::new();
        for (tokens, cost) in results {
            let token_list = js_sys::Array::new();
            for token in tokens {
                token_list.push(&JsToken::from_token(token).to_json());
            }

            let entry = js_sys::Object::new();
            // Propagate a failed property write to the caller instead of
            // panicking inside the wasm module.
            js_sys::Reflect::set(&entry, &tokens_key, &token_list)?;
            // The cost is exposed as a JS number, hence the conversion to f64.
            js_sys::Reflect::set(&entry, &cost_key, &JsValue::from(cost as f64))?;
            outer.push(&entry);
        }
        Ok(outer.into())
    }

    /// Snake-case alias of [`Tokenizer::tokenize_nbest`], exported as
    /// `tokenize_nbest`.
    #[wasm_bindgen(js_name = "tokenize_nbest")]
    pub fn py_tokenize_nbest(
        &self,
        input_text: &str,
        n: usize,
        unique: Option<bool>,
        cost_threshold: Option<i64>,
    ) -> Result<JsValue, JsValue> {
        self.tokenize_nbest(input_text, n, unique, cost_threshold)
    }
}
#[cfg(test)]
mod tests {
    #[cfg(target_arch = "wasm32")]
    use wasm_bindgen_test::wasm_bindgen_test;

    // Builds a tokenizer in "normal" mode on the embedded IPADIC dictionary and
    // checks the segmentation of a compound noun phrase.
    #[cfg(target_arch = "wasm32")]
    #[wasm_bindgen_test]
    fn test_tokenize() {
        use crate::TokenizerBuilder;

        let tokenizer = {
            let mut builder = TokenizerBuilder::new().unwrap();
            builder.set_mode("normal").unwrap();
            builder.set_dictionary("embedded://ipadic").unwrap();
            builder.build().unwrap()
        };

        let tokens = tokenizer.tokenize("関西国際空港限定トートバッグ").unwrap();

        let expected_surfaces = ["関西国際空港", "限定", "トートバッグ"];
        assert_eq!(tokens.len(), 3);
        for (token, expected) in tokens.iter().zip(expected_surfaces) {
            assert_eq!(token.surface, expected);
        }

        // The first detail of the first token is its part of speech.
        assert_eq!(tokens[0].get_detail(0), Some("名詞".to_string()));
    }
}