1use std::path::Path;
2use std::str::FromStr;
3
4use serde_json::Value;
5use wasm_bindgen::prelude::*;
6
7use lindera::dictionary::DictionaryKind;
8use lindera::mode::Mode;
9use lindera::token::Token;
10use lindera::tokenizer::{
11 Tokenizer as LinderaTokenizer, TokenizerBuilder as LinderaTokenizerBuilder,
12};
13
14const VERSION: &str = env!("CARGO_PKG_VERSION");
15
16#[wasm_bindgen(js_name = "getVersion")]
17pub fn get_version() -> String {
18 VERSION.to_string()
19}
20
21fn token_to_json(token: &mut Token) -> Value {
22 serde_json::json!({
23 "text": token.text,
24 "details": token.details().clone(),
25 "byte_start": token.byte_start,
26 "byte_end": token.byte_end,
27 "word_id": token.word_id,
28 })
29}
30
/// JS-facing builder that accumulates tokenizer configuration
/// (mode, dictionaries, filters) before producing a [`Tokenizer`].
#[wasm_bindgen]
pub struct TokenizerBuilder {
    // Wrapped Lindera builder holding the actual configuration state.
    inner: LinderaTokenizerBuilder,
}
35
36#[wasm_bindgen]
37impl TokenizerBuilder {
38 #[wasm_bindgen(constructor)]
39 pub fn new() -> Result<Self, JsValue> {
40 let inner =
41 LinderaTokenizerBuilder::new().map_err(|e| JsValue::from_str(&e.to_string()))?;
42
43 Ok(Self { inner })
44 }
45
46 pub fn build(self) -> Result<Tokenizer, JsValue> {
47 let inner = self
48 .inner
49 .build()
50 .map_err(|e| JsValue::from_str(&e.to_string()))?;
51
52 Ok(Tokenizer { inner })
53 }
54
55 #[wasm_bindgen(js_name = "setMode")]
56 pub fn set_mode(&mut self, mode: &str) -> Result<(), JsValue> {
57 let m = Mode::from_str(mode).map_err(|e| JsValue::from_str(&e.to_string()))?;
58 self.inner.set_segmenter_mode(&m);
59
60 Ok(())
61 }
62
63 #[wasm_bindgen(js_name = "setDictionaryKind")]
64 pub fn set_dictionary_kind(&mut self, kind: &str) -> Result<(), JsValue> {
65 let k = DictionaryKind::from_str(kind).map_err(|e| JsValue::from_str(&e.to_string()))?;
66 self.inner.set_segmenter_dictionary_kind(&k);
67
68 Ok(())
69 }
70
71 #[wasm_bindgen(js_name = "setDictionaryPath")]
72 pub fn set_dictionary_path(&mut self, path: &str) -> Result<(), JsValue> {
73 self.inner.set_segmenter_dictionary_path(Path::new(path));
74
75 Ok(())
76 }
77
78 #[wasm_bindgen(js_name = "setUserDictionaryPath")]
79 pub fn set_user_dictionary_path(&mut self, path: &str) -> Result<(), JsValue> {
80 self.inner
81 .set_segmenter_user_dictionary_path(Path::new(path));
82
83 Ok(())
84 }
85
86 #[wasm_bindgen(js_name = "setUserDictionaryKind")]
87 pub fn set_user_dictionary_kind(&mut self, kind: &str) -> Result<(), JsValue> {
88 let k = DictionaryKind::from_str(kind).map_err(|e| JsValue::from_str(&e.to_string()))?;
89 self.inner.set_segmenter_user_dictionary_kind(&k);
90
91 Ok(())
92 }
93
94 #[wasm_bindgen(js_name = "appendCharacterFilter")]
95 pub fn append_character_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
96 let a = serde_wasm_bindgen::from_value::<Value>(args)
97 .map_err(|e| JsValue::from_str(&e.to_string()))?;
98
99 self.inner.append_character_filter(name, &a);
100
101 Ok(())
102 }
103
104 #[wasm_bindgen(js_name = "appendTokenFilter")]
105 pub fn append_token_filter(&mut self, name: &str, args: JsValue) -> Result<(), JsValue> {
106 let a = serde_wasm_bindgen::from_value::<Value>(args)
107 .map_err(|e| JsValue::from_str(&e.to_string()))?;
108
109 self.inner.append_token_filter(name, &a);
110
111 Ok(())
112 }
113}
114
/// JS-facing tokenizer produced by [`TokenizerBuilder::build`].
#[wasm_bindgen]
pub struct Tokenizer {
    // The fully-configured Lindera tokenizer that does the actual work.
    inner: LinderaTokenizer,
}
119
120#[wasm_bindgen]
121impl Tokenizer {
122 pub fn tokenize(&self, input_text: &str) -> Result<JsValue, JsValue> {
123 let mut tokens = self
124 .inner
125 .tokenize(input_text)
126 .map_err(|e| JsValue::from_str(&e.to_string()))?;
127
128 let js_value = serde_wasm_bindgen::to_value(
129 &tokens
130 .iter_mut()
131 .map(|token| token_to_json(token))
132 .collect::<Vec<_>>(),
133 )
134 .map_err(|e| JsValue::from_str(&e.to_string()))?;
135
136 Ok(js_value)
137 }
138}
139
#[cfg(test)]
mod tests {
    #[cfg(target_arch = "wasm32")]
    use wasm_bindgen_test::wasm_bindgen_test;

    /// End-to-end check (wasm32 only): configure a tokenizer with the
    /// IPADIC dictionary in normal mode and verify that a compound phrase
    /// is segmented into the expected surface forms.
    #[cfg(target_arch = "wasm32")]
    #[wasm_bindgen_test]
    fn test_tokenize() {
        use crate::TokenizerBuilder;
        use serde_json::Value;

        let mut builder = TokenizerBuilder::new().unwrap();
        builder.set_mode("normal").unwrap();
        builder.set_dictionary_kind("ipadic").unwrap();
        let tokenizer = builder.build().unwrap();

        let result = tokenizer.tokenize("関西国際空港限定トートバッグ").unwrap();
        let tokens: Vec<Value> = serde_wasm_bindgen::from_value(result).unwrap();

        assert_eq!(tokens.len(), 3);
        let expected = ["関西国際空港", "限定", "トートバッグ"];
        for (token, surface) in tokens.iter().zip(expected) {
            assert_eq!(token.get("text").unwrap(), surface);
        }
    }
}