use std::{fs, path::Path};

use serde::Serialize;
use serde_json::Value;

use lindera_core::error::LinderaErrorKind;
use lindera_core::LinderaResult;
use lindera_filter::character_filter::{correct_offset, BoxCharacterFilter, CharacterFilterLoader};
use lindera_filter::token::Token;
use lindera_filter::token_filter::{BoxTokenFilter, TokenFilterLoader};
use lindera_tokenizer::tokenizer::Tokenizer;

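/// A JSON-backed configuration for building an [`Analyzer`].
///
/// The JSON document has three sections: `character_filters`, `tokenizer`,
/// and `token_filters` (see the tests below for complete examples).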
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct AnalyzerConfig {
    inner: Value,
}

impl AnalyzerConfig {
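    /// Reads an analyzer configuration from a JSON file on disk.
    /// See [`AnalyzerConfig::from_slice`] for the expected document shape.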
    pub fn from_file(path: &Path) -> LinderaResult<Self> {
        let bytes = fs::read(path).map_err(|err| LinderaErrorKind::Io.with_error(err))?;

        Self::from_slice(&bytes)
    }

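    /// Parses an analyzer configuration from raw JSON bytes.
    ///
    /// A minimal sketch; the crate path `lindera_analyzer::analyzer` is an
    /// assumption for illustration and may differ in your build:
    ///
    /// ```ignore
    /// use lindera_analyzer::analyzer::AnalyzerConfig;
    ///
    /// let json = r#"{"tokenizer": {"dictionary": {"kind": "ipadic"}, "mode": "normal"}}"#;
    /// let config = AnalyzerConfig::from_slice(json.as_bytes())?;
    /// ```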
    pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
        let args = serde_json::from_slice::<Value>(data)
            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;

        Ok(Self { inner: args })
    }
}

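/// A text analysis pipeline: character filters, then a tokenizer, then token
/// filters. Token byte offsets are mapped back to the original, unfiltered
/// text at the end of [`Analyzer::analyze`].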
pub struct Analyzer {
    /// Character filters applied to the raw input text before tokenization.
    pub character_filters: Vec<BoxCharacterFilter>,

    /// Tokenizer that splits the filtered text into tokens.
    pub tokenizer: Tokenizer,

    /// Token filters applied to the token stream after tokenization.
    pub token_filters: Vec<BoxTokenFilter>,
}

impl Analyzer {
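    /// Builds an `Analyzer` from an [`AnalyzerConfig`].
    ///
    /// `character_filters` and `token_filters` are optional arrays of
    /// `{"kind": ..., "args": ...}` objects; the `tokenizer` section is
    /// required. A minimal sketch, where `config_json` holds a document like
    /// the ones in the tests below:
    ///
    /// ```ignore
    /// let config = AnalyzerConfig::from_slice(config_json.as_bytes())?;
    /// let analyzer = Analyzer::from_config(&config)?;
    /// ```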
    pub fn from_config(config: &AnalyzerConfig) -> LinderaResult<Self> {
        let value = &config.inner;

        // Load the character filters, if any are configured.
        let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
        if let Some(character_filter_settings) = value["character_filters"].as_array() {
            for character_filter_setting in character_filter_settings {
                if let Some(character_filter_name) = character_filter_setting["kind"].as_str() {
                    let character_filter = CharacterFilterLoader::load_from_value(
                        character_filter_name,
                        &character_filter_setting["args"],
                    )?;
                    character_filters.push(character_filter);
                }
            }
        }

        // The tokenizer section is mandatory; round-trip it through bytes so
        // serde can deserialize it into the tokenizer's config type.
        let args_value = value["tokenizer"].as_object().ok_or_else(|| {
            LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!("missing tokenizer config."))
        })?;
        let arg_bytes = serde_json::to_vec(args_value)
            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;

        let tokenizer_config = serde_json::from_slice(&arg_bytes)
            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;
        let tokenizer = Tokenizer::from_config(tokenizer_config)?;

        // Load the token filters, if any are configured.
        let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
        if let Some(token_filter_settings) = value["token_filters"].as_array() {
            for token_filter_setting in token_filter_settings {
                if let Some(token_filter_name) = token_filter_setting["kind"].as_str() {
                    token_filters.push(TokenFilterLoader::load_from_value(
                        token_filter_name,
                        &token_filter_setting["args"],
                    )?);
                }
            }
        }

        Ok(Self::new(character_filters, tokenizer, token_filters))
    }

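    /// Assembles an `Analyzer` from already-constructed filters and a tokenizer.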
    pub fn new(
        character_filters: Vec<BoxCharacterFilter>,
        tokenizer: Tokenizer,
        token_filters: Vec<BoxTokenFilter>,
    ) -> Self {
        Self {
            character_filters,
            tokenizer,
            token_filters,
        }
    }

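    /// Analyzes `text`: applies the character filters, tokenizes the filtered
    /// text, applies the token filters, and finally corrects each token's byte
    /// offsets so they refer back to the original input.
    ///
    /// A minimal sketch, assuming a dictionary feature such as `ipadic` is
    /// enabled and `analyzer` was built with [`Analyzer::from_config`]:
    ///
    /// ```ignore
    /// let tokens = analyzer.analyze("リンデラは形態素解析エンジンです。")?;
    /// for token in tokens {
    ///     println!("{}: {}..{}", token.text, token.byte_start, token.byte_end);
    /// }
    /// ```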
    pub fn analyze(&self, text: &str) -> LinderaResult<Vec<Token>> {
        let mut normalized_text = text.to_string();

        // For every character filter that changed the text, record the
        // offsets/diffs it reported (most recent first) so token offsets can
        // be mapped back to the original text after tokenization.
        let mut text_len_vec: Vec<usize> = Vec::new();
        let mut offsets_vec: Vec<Vec<usize>> = Vec::new();
        let mut diffs_vec: Vec<Vec<i64>> = Vec::new();

        for character_filter in &self.character_filters {
            let (new_text, offsets, diffs) = character_filter.apply(normalized_text.as_str())?;

            if !offsets.is_empty() {
                offsets_vec.insert(0, offsets);
                diffs_vec.insert(0, diffs);
                text_len_vec.insert(0, new_text.len());
            }

            normalized_text = new_text;
        }

        // Tokenize the filtered text.
        let mut tmp_tokens = self.tokenizer.tokenize(&normalized_text)?;

        // Convert the tokenizer's tokens into owned filter tokens, resolving
        // the dictionary details for each one.
        let mut tokens = Vec::new();
        for token in tmp_tokens.iter_mut() {
            tokens.push(Token {
                text: token.text.to_string(),
                byte_start: token.byte_start,
                byte_end: token.byte_end,
                position: token.position,
                position_length: token.position_length,
                word_id: token.word_id,
                details: token
                    .get_details()
                    .ok_or_else(|| {
                        LinderaErrorKind::Content
                            .with_error(anyhow::anyhow!("failed to get word details"))
                    })?
                    .iter()
                    .map(|s| s.to_string())
                    .collect::<Vec<String>>(),
            });
        }

        // Apply the token filters in order.
        for token_filter in &self.token_filters {
            token_filter.apply(&mut tokens)?;
        }

        // Undo the character filters' edits on the byte offsets, most recent
        // filter first, so offsets refer to the original input text.
        for token in tokens.iter_mut() {
            for (i, offsets) in offsets_vec.iter().enumerate() {
                token.byte_start =
                    correct_offset(token.byte_start, offsets, &diffs_vec[i], text_len_vec[i]);
                token.byte_end =
                    correct_offset(token.byte_end, offsets, &diffs_vec[i], text_len_vec[i]);
            }
        }

        Ok(tokens)
    }
}

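// `Clone` is implemented by hand because the boxed filter trait objects can
// only be duplicated through their `box_clone` methods.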
impl Clone for Analyzer {
    fn clone(&self) -> Self {
        let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
        for character_filter in self.character_filters.iter() {
            character_filters.push(character_filter.box_clone());
        }

        let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
        for token_filter in self.token_filters.iter() {
            token_filters.push(token_filter.box_clone());
        }

        Analyzer {
            character_filters,
            tokenizer: self.tokenizer.clone(),
            token_filters,
        }
    }
}

#[cfg(test)]
mod tests {
    #[cfg(all(
        any(feature = "ipadic", feature = "ipadic-neologd"),
        feature = "filter"
    ))]
    use crate::analyzer::{Analyzer, AnalyzerConfig};

    #[test]
    #[cfg(all(feature = "ipadic", feature = "filter"))]
    fn test_analyzer_config_from_slice() {
        let config_str = r#"
        {
            "character_filters": [
                {
                    "kind": "unicode_normalize",
                    "args": {
                        "kind": "nfkc"
                    }
                },
                {
                    "kind": "mapping",
                    "args": {
                        "mapping": {
                            "リンデラ": "Lindera"
                        }
                    }
                }
            ],
            "tokenizer": {
                "dictionary": {
                    "kind": "ipadic"
                },
                "mode": "normal"
            },
            "token_filters": [
                {
                    "kind": "japanese_stop_tags",
                    "args": {
                        "tags": [
                            "接続詞",
                            "助詞",
                            "助詞,格助詞",
                            "助詞,格助詞,一般",
                            "助詞,格助詞,引用",
                            "助詞,格助詞,連語",
                            "助詞,係助詞",
                            "助詞,副助詞",
                            "助詞,間投助詞",
                            "助詞,並立助詞",
                            "助詞,終助詞",
                            "助詞,副助詞/並立助詞/終助詞",
                            "助詞,連体化",
                            "助詞,副詞化",
                            "助詞,特殊",
                            "助動詞",
                            "記号",
                            "記号,一般",
                            "記号,読点",
                            "記号,句点",
                            "記号,空白",
                            "記号,括弧閉",
                            "その他,間投",
                            "フィラー",
                            "非言語音"
                        ]
                    }
                },
                {
                    "kind": "japanese_katakana_stem",
                    "args": {
                        "min": 3
                    }
                }
            ]
        }
        "#;
        let result = AnalyzerConfig::from_slice(config_str.as_bytes());

        assert!(result.is_ok());
    }

    #[test]
    #[cfg(all(feature = "ipadic", feature = "filter"))]
    fn test_analyzer_config_clone() {
        let config_str = r#"
        {
            "character_filters": [
                {
                    "kind": "unicode_normalize",
                    "args": {
                        "kind": "nfkc"
                    }
                },
                {
                    "kind": "mapping",
                    "args": {
                        "mapping": {
                            "リンデラ": "Lindera"
                        }
                    }
                }
            ],
            "tokenizer": {
                "dictionary": {
                    "kind": "ipadic"
                },
                "mode": "normal"
            },
            "token_filters": [
                {
                    "kind": "japanese_stop_tags",
                    "args": {
                        "tags": [
                            "接続詞",
                            "助詞",
                            "助詞,格助詞",
                            "助詞,格助詞,一般",
                            "助詞,格助詞,引用",
                            "助詞,格助詞,連語",
                            "助詞,係助詞",
                            "助詞,副助詞",
                            "助詞,間投助詞",
                            "助詞,並立助詞",
                            "助詞,終助詞",
                            "助詞,副助詞/並立助詞/終助詞",
                            "助詞,連体化",
                            "助詞,副詞化",
                            "助詞,特殊",
                            "助動詞",
                            "記号",
                            "記号,一般",
                            "記号,読点",
                            "記号,句点",
                            "記号,空白",
                            "記号,括弧閉",
                            "その他,間投",
                            "フィラー",
                            "非言語音"
                        ]
                    }
                },
                {
                    "kind": "japanese_katakana_stem",
                    "args": {
                        "min": 3
                    }
                }
            ]
        }
        "#;
        let analyzer_config = AnalyzerConfig::from_slice(config_str.as_bytes()).unwrap();

        let cloned_analyzer_config = analyzer_config.clone();

        assert_eq!(analyzer_config.inner, cloned_analyzer_config.inner);
    }

    #[test]
    #[cfg(all(feature = "ipadic", feature = "filter"))]
    fn test_ipadic_analyzer_analyze() {
        let config_str = r#"
        {
            "character_filters": [
                {
                    "kind": "unicode_normalize",
                    "args": {
                        "kind": "nfkc"
                    }
                },
                {
                    "kind": "japanese_iteration_mark",
                    "args": {
                        "normalize_kanji": true,
                        "normalize_kana": true
                    }
                },
                {
                    "kind": "mapping",
                    "args": {
                        "mapping": {
                            "リンデラ": "Lindera"
                        }
                    }
                }
            ],
            "tokenizer": {
                "dictionary": {
                    "kind": "ipadic"
                },
                "mode": "normal"
            },
            "token_filters": [
                {
                    "kind": "japanese_compound_word",
                    "args": {
                        "kind": "ipadic",
                        "tags": [
                            "名詞,数",
                            "名詞,接尾,助数詞"
                        ]
                    }
                },
                {
                    "kind": "japanese_stop_tags",
                    "args": {
                        "tags": [
                            "接続詞",
                            "助詞",
                            "助詞,格助詞",
                            "助詞,格助詞,一般",
                            "助詞,格助詞,引用",
                            "助詞,格助詞,連語",
                            "助詞,係助詞",
                            "助詞,副助詞",
                            "助詞,間投助詞",
                            "助詞,並立助詞",
                            "助詞,終助詞",
                            "助詞,副助詞/並立助詞/終助詞",
                            "助詞,連体化",
                            "助詞,副詞化",
                            "助詞,特殊",
                            "助動詞",
                            "記号",
                            "記号,一般",
                            "記号,読点",
                            "記号,句点",
                            "記号,空白",
                            "記号,括弧閉",
                            "その他,間投",
                            "フィラー",
                            "非言語音"
                        ]
                    }
                },
                {
                    "kind": "japanese_katakana_stem",
                    "args": {
                        "min": 3
                    }
                }
            ]
        }
        "#;
        let analyzer_config = AnalyzerConfig::from_slice(config_str.as_bytes()).unwrap();

        let analyzer = Analyzer::from_config(&analyzer_config).unwrap();

        {
            let text = "リンデラは形態素解析エンジンです。".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "Lindera".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 15);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(token.details, vec!["UNK".to_string()]);
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "形態素".to_string());
                assert_eq!(token.byte_start, 18);
                assert_eq!(token.byte_end, 27);
                assert_eq!(token.position, 2);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "形態素".to_string(),
                        "ケイタイソ".to_string(),
                        "ケイタイソ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "解析".to_string());
                assert_eq!(token.byte_start, 27);
                assert_eq!(token.byte_end, 33);
                assert_eq!(token.position, 3);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "サ変接続".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "解析".to_string(),
                        "カイセキ".to_string(),
                        "カイセキ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "エンジン".to_string());
                assert_eq!(token.byte_start, 33);
                assert_eq!(token.byte_end, 48);
                assert_eq!(token.position, 4);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "エンジン".to_string(),
                        "エンジン".to_string(),
                        "エンジン".to_string()
                    ]
                );
            }

            // The corrected offsets point back into the original, unfiltered text.
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "Lindera".to_string());
                assert_eq!(&text[start..end], "リンデラ");
            }
        }

        {
            // NFKC normalization expands ㌎ (SQUARE GARON) into ガロン.
            let text = "10㌎のガソリン".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "10".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 6);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(token.details, vec!["UNK".to_string()]);
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "ガロン".to_string());
                assert_eq!(token.byte_start, 6);
                assert_eq!(token.byte_end, 9);
                assert_eq!(token.position, 1);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "接尾".to_string(),
                        "助数詞".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "ガロン".to_string(),
                        "ガロン".to_string(),
                        "ガロン".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "ガソリン".to_string());
                assert_eq!(token.byte_start, 12);
                assert_eq!(token.byte_end, 27);
                assert_eq!(token.position, 3);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "ガソリン".to_string(),
                        "ガソリン".to_string(),
                        "ガソリン".to_string()
                    ]
                );
            }

            // The corrected offsets point back into the original, unfiltered text.
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "10".to_string());
                assert_eq!(&text[start..end], "10");
            }
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "ガロン".to_string());
                assert_eq!(&text[start..end], "㌎");
            }
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "ガソリン".to_string());
                assert_eq!(&text[start..end], "ガソリン");
            }
        }

        {
            // The japanese_compound_word filter joins 百三十四円 into one token.
            let text = "お釣りは百三十四円です。".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "お釣り".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 9);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "お釣り".to_string(),
                        "オツリ".to_string(),
                        "オツリ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "百三十四円".to_string());
                assert_eq!(token.byte_start, 12);
                assert_eq!(token.byte_end, 27);
                assert_eq!(token.position, 2);
                assert_eq!(token.position_length, 5);
                assert_eq!(
                    token.details,
                    vec![
                        "複合語".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string()
                    ]
                );
            }
        }

        {
            // The japanese_iteration_mark filter normalizes 騒々しい to 騒騒しい.
            let text = "ここは騒々しい".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "ここ".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 6);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "代名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "ここ".to_string(),
                        "ココ".to_string(),
                        "ココ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "騒騒しい".to_string());
                assert_eq!(token.byte_start, 9);
                assert_eq!(token.byte_end, 21);
                assert_eq!(token.position, 2);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "形容詞".to_string(),
                        "自立".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "形容詞・イ段".to_string(),
                        "基本形".to_string(),
                        "騒騒しい".to_string(),
                        "ソウゾウシイ".to_string(),
                        "ソーゾーシイ".to_string()
                    ]
                );
            }
        }
    }
}