lindera_wasm/
lib.rs

1use std::path::Path;
2use std::str::FromStr;
3
4use serde_json::Value;
5use wasm_bindgen::prelude::*;
6
7use lindera::dictionary::DictionaryKind;
8use lindera::mode::Mode;
9use lindera::token::Token;
10use lindera::tokenizer::{
11    Tokenizer as LinderaTokenizer, TokenizerBuilder as LinderaTokenizerBuilder,
12};
13
14const VERSION: &str = env!("CARGO_PKG_VERSION");
15
16#[wasm_bindgen(js_name = "getVersion")]
17pub fn get_version() -> String {
18    VERSION.to_string()
19}
20
21fn token_to_json(token: &mut Token) -> Value {
22    serde_json::json!({
23        "text": token.text,
24        "details": token.details().clone(),
25        "byte_start": token.byte_start,
26        "byte_end": token.byte_end,
27        "word_id": token.word_id,
28    })
29}
30
31#[wasm_bindgen]
32pub struct TokenizerBuilder {
33    inner: LinderaTokenizerBuilder,
34}
35
36#[wasm_bindgen]
37impl TokenizerBuilder {
38    #[wasm_bindgen(constructor)]
39    pub fn new() -> Result<Self, JsValue> {
40        let inner =
41            LinderaTokenizerBuilder::new().map_err(|e| JsValue::from_str(&e.to_string()))?;
42
43        Ok(Self { inner })
44    }
45
46    pub fn build(self) -> Result<Tokenizer, JsValue> {
47        let inner = self
48            .inner
49            .build()
50            .map_err(|e| JsValue::from_str(&e.to_string()))?;
51
52        Ok(Tokenizer { inner })
53    }
54
55    #[wasm_bindgen(js_name = "setMode")]
56    pub fn set_mode(&mut self, mode: &str) -> Result<(), JsValue> {
57        let m = Mode::from_str(mode).map_err(|e| JsValue::from_str(&e.to_string()))?;
58        self.inner.set_segmenter_mode(&m);
59
60        Ok(())
61    }
62
63    #[wasm_bindgen(js_name = "setDictionaryKind")]
64    pub fn set_dictionary_kind(&mut self, kind: &str) -> Result<(), JsValue> {
65        let k = DictionaryKind::from_str(kind).map_err(|e| JsValue::from_str(&e.to_string()))?;
66        self.inner.set_segmenter_dictionary_kind(&k);
67
68        Ok(())
69    }
70
71    #[wasm_bindgen(js_name = "setDictionaryPath")]
72    pub fn set_dictionary_path(&mut self, path: &str) -> Result<(), JsValue> {
73        self.inner.set_segmenter_dictionary_path(Path::new(path));
74
75        Ok(())
76    }
77
78    #[wasm_bindgen(js_name = "setUserDictionaryPath")]
79    pub fn set_user_dictionary_path(&mut self, path: &str) -> Result<(), JsValue> {
80        self.inner
81            .set_segmenter_user_dictionary_path(Path::new(path));
82
83        Ok(())
84    }
85
86    #[wasm_bindgen(js_name = "setUserDictionaryKind")]
87    pub fn set_user_dictionary_kind(&mut self, kind: &str) -> Result<(), JsValue> {
88        let k = DictionaryKind::from_str(kind).map_err(|e| JsValue::from_str(&e.to_string()))?;
89        self.inner.set_segmenter_user_dictionary_kind(&k);
90
91        Ok(())
92    }
93
94    #[wasm_bindgen(js_name = "appendCharacterFilter")]
95    pub fn append_character_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
96        let a = serde_wasm_bindgen::from_value::<Value>(args)
97            .map_err(|e| JsValue::from_str(&e.to_string()))?;
98
99        self.inner.append_character_filter(name, &a);
100
101        Ok(())
102    }
103
104    #[wasm_bindgen(js_name = "appendTokenFilter")]
105    pub fn append_token_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
106        let a = serde_wasm_bindgen::from_value::<Value>(args)
107            .map_err(|e| JsValue::from_str(&e.to_string()))?;
108
109        self.inner.append_token_filter(name, &a);
110
111        Ok(())
112    }
113}
114
115#[wasm_bindgen]
116pub struct Tokenizer {
117    inner: LinderaTokenizer,
118}
119
120#[wasm_bindgen]
121impl Tokenizer {
122    pub fn tokenize(&self, input_text: &str) -> Result<JsValue, JsValue> {
123        let mut tokens = self
124            .inner
125            .tokenize(input_text)
126            .map_err(|e| JsValue::from_str(&e.to_string()))?;
127
128        let js_value = serde_wasm_bindgen::to_value(
129            &tokens
130                .iter_mut()
131                .map(|token| token_to_json(token))
132                .collect::<Vec<_>>(),
133        )
134        .map_err(|e| JsValue::from_str(&e.to_string()))?;
135
136        Ok(js_value)
137    }
138}
139
#[cfg(test)]
mod tests {
    #[cfg(target_arch = "wasm32")]
    use wasm_bindgen_test::wasm_bindgen_test;

    /// Builds a tokenizer in "normal" mode with the "ipadic" dictionary and
    /// checks the surface text of every token produced for a sample phrase.
    ///
    /// Only compiled for `wasm32` targets.
    #[cfg(target_arch = "wasm32")]
    #[wasm_bindgen_test]
    fn test_tokenize() {
        use crate::TokenizerBuilder;
        use serde_json::Value;

        // Expected token texts, in output order.
        let expected_texts = ["関西国際空港", "限定", "トートバッグ"];

        let mut builder = TokenizerBuilder::new().unwrap();
        builder.set_mode("normal").unwrap();
        builder.set_dictionary_kind("ipadic").unwrap();
        let tokenizer = builder.build().unwrap();

        let js_tokens = tokenizer.tokenize("関西国際空港限定トートバッグ").unwrap();
        let tokens: Vec<Value> = serde_wasm_bindgen::from_value(js_tokens).unwrap();

        // Check the count first so the zip below cannot silently skip tokens.
        assert_eq!(tokens.len(), 3);
        for (token, expected) in tokens.iter().zip(expected_texts.iter().copied()) {
            assert_eq!(token.get("text").unwrap(), expected);
        }
    }
}