lindera_analyzer/
analyzer.rs

1use std::{fs, path::Path};
2
3use serde::Serialize;
4use serde_json::Value;
5
6use lindera_core::error::LinderaErrorKind;
7use lindera_core::LinderaResult;
8use lindera_filter::character_filter::{correct_offset, BoxCharacterFilter, CharacterFilterLoader};
9use lindera_filter::token::Token;
10use lindera_filter::token_filter::{BoxTokenFilter, TokenFilterLoader};
11use lindera_tokenizer::tokenizer::Tokenizer;
12
/// Configuration for an analyzer, held as a raw JSON value.
///
/// The JSON is expected to carry `character_filters`, `tokenizer`, and
/// `token_filters` sections; it is validated lazily by `Analyzer::from_config`.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct AnalyzerConfig {
    // Parsed JSON configuration as read from a file or byte slice.
    inner: Value,
}
17
18impl AnalyzerConfig {
19    pub fn from_file(path: &Path) -> LinderaResult<Self> {
20        let bytes = fs::read(path).map_err(|err| LinderaErrorKind::Io.with_error(err))?;
21
22        Self::from_slice(&bytes)
23    }
24
25    pub fn from_slice(data: &[u8]) -> LinderaResult<Self> {
26        let args = serde_json::from_slice::<Value>(data)
27            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;
28
29        Ok(Self { inner: args })
30    }
31}
32
/// A text analysis pipeline: character filters are applied first, the
/// result is tokenized, then token filters post-process the tokens.
pub struct Analyzer {
    /// Character filters
    pub character_filters: Vec<BoxCharacterFilter>,

    /// Tokenizer
    pub tokenizer: Tokenizer,

    /// Token filters
    pub token_filters: Vec<BoxTokenFilter>,
}
43
44impl Analyzer {
45    pub fn from_config(config: &AnalyzerConfig) -> LinderaResult<Self> {
46        let value = &config.inner;
47
48        let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
49        let character_filter_settings = value["character_filters"].as_array();
50        if let Some(character_filter_settings) = character_filter_settings {
51            for character_filter_setting in character_filter_settings {
52                let character_filter_name = character_filter_setting["kind"].as_str();
53                if let Some(character_filter_name) = character_filter_name {
54                    let character_filter = CharacterFilterLoader::load_from_value(
55                        character_filter_name,
56                        &character_filter_setting["args"],
57                    )?;
58                    character_filters.push(character_filter);
59                }
60            }
61        }
62
63        let args_value = value["tokenizer"].as_object().ok_or_else(|| {
64            LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!("missing tokenizer config."))
65        })?;
66        let arg_bytes = serde_json::to_vec(args_value)
67            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;
68
69        let tokenizer_config = serde_json::from_slice(&arg_bytes)
70            .map_err(|err| LinderaErrorKind::Deserialize.with_error(err))?;
71        let tokenizer = Tokenizer::from_config(tokenizer_config)?;
72
73        let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
74        let token_filter_settings = value["token_filters"].as_array();
75        if let Some(token_filter_settings) = token_filter_settings {
76            for token_filter_setting in token_filter_settings {
77                let token_filter_name = token_filter_setting["kind"].as_str();
78                if let Some(token_filter_name) = token_filter_name {
79                    token_filters.push(TokenFilterLoader::load_from_value(
80                        token_filter_name,
81                        &token_filter_setting["args"],
82                    )?);
83                }
84            }
85        }
86
87        Ok(Self::new(character_filters, tokenizer, token_filters))
88    }
89
90    pub fn new(
91        character_filters: Vec<BoxCharacterFilter>,
92        tokenizer: Tokenizer,
93        token_filters: Vec<BoxTokenFilter>,
94    ) -> Self {
95        Self {
96            character_filters,
97            tokenizer,
98            token_filters,
99        }
100    }
101
102    pub fn analyze(&self, text: &str) -> LinderaResult<Vec<Token>> {
103        let mut normalized_text = text.to_string();
104
105        let mut text_len_vec: Vec<usize> = Vec::new();
106        let mut offsets_vec: Vec<Vec<usize>> = Vec::new();
107        let mut diffs_vec: Vec<Vec<i64>> = Vec::new();
108
109        // Appy character filters.
110        for character_filter in &self.character_filters {
111            let (new_text, offsets, diffs) = character_filter.apply(normalized_text.as_str())?;
112
113            if !offsets.is_empty() {
114                // Record the offsets of each character filter.
115                offsets_vec.insert(0, offsets);
116
117                // Record the diffs of each character filter.
118                diffs_vec.insert(0, diffs);
119
120                // Record the length of the text after each character filter is applied.
121                text_len_vec.insert(0, new_text.len());
122            }
123
124            normalized_text = new_text;
125        }
126
127        // Tokenize.
128        let mut tmp_tokens = self.tokenizer.tokenize(&normalized_text)?;
129
130        // Make analyzed tokens.
131        let mut tokens = Vec::new();
132        for token in tmp_tokens.iter_mut() {
133            tokens.push(Token {
134                text: token.text.to_string(),
135                byte_start: token.byte_start,
136                byte_end: token.byte_end,
137                position: token.position,
138                position_length: token.position_length,
139                word_id: token.word_id,
140                details: token
141                    .get_details()
142                    .ok_or_else(|| {
143                        LinderaErrorKind::Content.with_error(anyhow::anyhow!("unknown error"))
144                    })?
145                    .iter()
146                    .map(|s| s.to_string())
147                    .collect::<Vec<String>>(),
148            });
149        }
150
151        // Apply token filters.
152        for token_filter in &self.token_filters {
153            token_filter.apply(&mut tokens)?;
154        }
155
156        // Correct token offsets
157        for token in tokens.iter_mut() {
158            // Override details.
159            for (i, offsets) in offsets_vec.iter().enumerate() {
160                // Override start.
161                token.byte_start =
162                    correct_offset(token.byte_start, offsets, &diffs_vec[i], text_len_vec[i]);
163
164                // Override end.
165                token.byte_end =
166                    correct_offset(token.byte_end, offsets, &diffs_vec[i], text_len_vec[i]);
167            }
168        }
169
170        Ok(tokens)
171    }
172}
173
174impl Clone for Analyzer {
175    fn clone(&self) -> Self {
176        let mut character_filters: Vec<BoxCharacterFilter> = Vec::new();
177        for character_filter in self.character_filters.iter() {
178            character_filters.push(character_filter.box_clone());
179        }
180
181        let mut token_filters: Vec<BoxTokenFilter> = Vec::new();
182        for token_filter in self.token_filters.iter() {
183            token_filters.push(token_filter.box_clone());
184        }
185
186        Analyzer {
187            character_filters,
188            tokenizer: self.tokenizer.clone(),
189            token_filters,
190        }
191    }
192}
193
#[cfg(test)]
mod tests {
    #[cfg(all(
        any(feature = "ipadic", feature = "ipadic-neologd"),
        feature = "filter"
    ))]
    use crate::analyzer::{Analyzer, AnalyzerConfig};

    #[test]
    #[cfg(all(feature = "ipadic", feature = "filter"))]
    fn test_analyzer_config_from_slice() {
        let config_str = r#"
        {
            "character_filters": [
                {
                    "kind": "unicode_normalize",
                    "args": {
                        "kind": "nfkc"
                    }
                },
                {
                    "kind": "mapping",
                    "args": {
                        "mapping": {
                            "リンデラ": "Lindera"
                        }
                    }
                }
            ],
            "tokenizer": {
                "dictionary": {
                    "kind": "ipadic"
                },
                "mode": "normal"
            },
            "token_filters": [
                {
                    "kind": "japanese_stop_tags",
                    "args": {
                        "tags": [
                            "接続詞",
                            "助詞",
                            "助詞,格助詞",
                            "助詞,格助詞,一般",
                            "助詞,格助詞,引用",
                            "助詞,格助詞,連語",
                            "助詞,係助詞",
                            "助詞,副助詞",
                            "助詞,間投助詞",
                            "助詞,並立助詞",
                            "助詞,終助詞",
                            "助詞,副助詞/並立助詞/終助詞",
                            "助詞,連体化",
                            "助詞,副詞化",
                            "助詞,特殊",
                            "助動詞",
                            "記号",
                            "記号,一般",
                            "記号,読点",
                            "記号,句点",
                            "記号,空白",
                            "記号,括弧閉",
                            "その他,間投",
                            "フィラー",
                            "非言語音"
                        ]
                    }
                },
                {
                    "kind": "japanese_katakana_stem",
                    "args": {
                        "min": 3
                    }
                }
            ]
        }
        "#;
        let result = AnalyzerConfig::from_slice(config_str.as_bytes());

        // `assert!` on the bool directly instead of `assert_eq!(true, ...)`.
        assert!(result.is_ok());
    }

    #[test]
    #[cfg(all(feature = "ipadic", feature = "filter"))]
    fn test_analyzer_config_clone() {
        let config_str = r#"
        {
            "character_filters": [
                {
                    "kind": "unicode_normalize",
                    "args": {
                        "kind": "nfkc"
                    }
                },
                {
                    "kind": "mapping",
                    "args": {
                        "mapping": {
                            "リンデラ": "Lindera"
                        }
                    }
                }
            ],
            "tokenizer": {
                "dictionary": {
                    "kind": "ipadic"
                },
                "mode": "normal"
            },
            "token_filters": [
                {
                    "kind": "japanese_stop_tags",
                    "args": {
                        "tags": [
                            "接続詞",
                            "助詞",
                            "助詞,格助詞",
                            "助詞,格助詞,一般",
                            "助詞,格助詞,引用",
                            "助詞,格助詞,連語",
                            "助詞,係助詞",
                            "助詞,副助詞",
                            "助詞,間投助詞",
                            "助詞,並立助詞",
                            "助詞,終助詞",
                            "助詞,副助詞/並立助詞/終助詞",
                            "助詞,連体化",
                            "助詞,副詞化",
                            "助詞,特殊",
                            "助動詞",
                            "記号",
                            "記号,一般",
                            "記号,読点",
                            "記号,句点",
                            "記号,空白",
                            "記号,括弧閉",
                            "その他,間投",
                            "フィラー",
                            "非言語音"
                        ]
                    }
                },
                {
                    "kind": "japanese_katakana_stem",
                    "args": {
                        "min": 3
                    }
                }
            ]
        }
        "#;
        let analyzer_config = AnalyzerConfig::from_slice(config_str.as_bytes()).unwrap();

        let cloned_analyzer_config = analyzer_config.clone();

        assert_eq!(analyzer_config.inner, cloned_analyzer_config.inner);
    }

    #[test]
    #[cfg(all(feature = "ipadic", feature = "filter"))]
    fn test_ipadic_analyzer_analyze() {
        let config_str = r#"
        {
            "character_filters": [
                {
                    "kind": "unicode_normalize",
                    "args": {
                        "kind": "nfkc"
                    }
                },
                {
                    "kind": "japanese_iteration_mark",
                    "args": {
                        "normalize_kanji": true,
                        "normalize_kana": true
                    }
                },
                {
                    "kind": "mapping",
                    "args": {
                        "mapping": {
                            "リンデラ": "Lindera"
                        }
                    }
                }
            ],
            "tokenizer": {
                "dictionary": {
                    "kind": "ipadic"
                },
                "mode": "normal"
            },
            "token_filters": [
                {
                    "kind": "japanese_compound_word",
                    "args": {
                        "kind": "ipadic",
                        "tags": [
                            "名詞,数",
                            "名詞,接尾,助数詞"
                        ]
                    }
                },
                {
                    "kind": "japanese_stop_tags",
                    "args": {
                        "tags": [
                            "接続詞",
                            "助詞",
                            "助詞,格助詞",
                            "助詞,格助詞,一般",
                            "助詞,格助詞,引用",
                            "助詞,格助詞,連語",
                            "助詞,係助詞",
                            "助詞,副助詞",
                            "助詞,間投助詞",
                            "助詞,並立助詞",
                            "助詞,終助詞",
                            "助詞,副助詞/並立助詞/終助詞",
                            "助詞,連体化",
                            "助詞,副詞化",
                            "助詞,特殊",
                            "助動詞",
                            "記号",
                            "記号,一般",
                            "記号,読点",
                            "記号,句点",
                            "記号,空白",
                            "記号,括弧閉",
                            "その他,間投",
                            "フィラー",
                            "非言語音"
                        ]
                    }
                },
                {
                    "kind": "japanese_katakana_stem",
                    "args": {
                        "min": 3
                    }
                }
            ]
        }
        "#;
        let analyzer_config = AnalyzerConfig::from_slice(config_str.as_bytes()).unwrap();

        let analyzer = Analyzer::from_config(&analyzer_config).unwrap();

        {
            let text = "リンデラは形態素解析エンジンです。".to_string();
            // `analyze` takes an immutable &str and does not mutate its input,
            // so no mutable working copy of `text` is needed.
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "Lindera".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 15);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(token.details, vec!["UNK".to_string()]);
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "形態素".to_string());
                assert_eq!(token.byte_start, 18);
                assert_eq!(token.byte_end, 27);
                assert_eq!(token.position, 2);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "形態素".to_string(),
                        "ケイタイソ".to_string(),
                        "ケイタイソ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "解析".to_string());
                assert_eq!(token.byte_start, 27);
                assert_eq!(token.byte_end, 33);
                assert_eq!(token.position, 3);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "サ変接続".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "解析".to_string(),
                        "カイセキ".to_string(),
                        "カイセキ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "エンジン".to_string());
                assert_eq!(token.byte_start, 33);
                assert_eq!(token.byte_end, 48);
                assert_eq!(token.position, 4);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "エンジン".to_string(),
                        "エンジン".to_string(),
                        "エンジン".to_string()
                    ]
                );
            }

            // Corrected offsets must point back into the ORIGINAL text.
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "Lindera".to_string());
                assert_eq!(&text[start..end], "リンデラ");
            }
        }

        {
            let text = "10㌎のガソリン".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "10".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 6);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(token.details, vec!["UNK".to_string()]);
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "ガロン".to_string());
                assert_eq!(token.byte_start, 6);
                assert_eq!(token.byte_end, 9);
                assert_eq!(token.position, 1);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "接尾".to_string(),
                        "助数詞".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "ガロン".to_string(),
                        "ガロン".to_string(),
                        "ガロン".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "ガソリン".to_string());
                assert_eq!(token.byte_start, 12);
                assert_eq!(token.byte_end, 27);
                assert_eq!(token.position, 3);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "ガソリン".to_string(),
                        "ガソリン".to_string(),
                        "ガソリン".to_string()
                    ]
                );
            }

            // Corrected offsets must point back into the ORIGINAL text,
            // including the pre-normalization "㌎" character.
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "10".to_string());
                assert_eq!(&text[start..end], "10");
            }
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "ガロン".to_string());
                assert_eq!(&text[start..end], "㌎");
            }
            {
                let token = tokens_iter.next().unwrap();
                let start = token.byte_start;
                let end = token.byte_end;
                assert_eq!(token.text, "ガソリン".to_string());
                assert_eq!(&text[start..end], "ガソリン");
            }
        }

        {
            let text = "お釣りは百三十四円です。".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "お釣り".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 9);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "お釣り".to_string(),
                        "オツリ".to_string(),
                        "オツリ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "百三十四円".to_string());
                assert_eq!(token.byte_start, 12);
                assert_eq!(token.byte_end, 27);
                assert_eq!(token.position, 2);
                assert_eq!(token.position_length, 5);
                assert_eq!(
                    token.details,
                    vec![
                        "複合語".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string()
                    ]
                );
            }
        }

        {
            let text = "ここは騒々しい".to_string();
            let tokens = analyzer.analyze(&text).unwrap();
            let mut tokens_iter = tokens.iter();
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "ここ".to_string());
                assert_eq!(token.byte_start, 0);
                assert_eq!(token.byte_end, 6);
                assert_eq!(token.position, 0);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "名詞".to_string(),
                        "代名詞".to_string(),
                        "一般".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "ここ".to_string(),
                        "ココ".to_string(),
                        "ココ".to_string()
                    ]
                );
            }
            {
                let token = tokens_iter.next().unwrap();
                assert_eq!(token.text, "騒騒しい".to_string());
                assert_eq!(token.byte_start, 9);
                assert_eq!(token.byte_end, 21);
                assert_eq!(token.position, 2);
                assert_eq!(token.position_length, 1);
                assert_eq!(
                    token.details,
                    vec![
                        "形容詞".to_string(),
                        "自立".to_string(),
                        "*".to_string(),
                        "*".to_string(),
                        "形容詞・イ段".to_string(),
                        "基本形".to_string(),
                        "騒騒しい".to_string(),
                        "ソウゾウシイ".to_string(),
                        "ソーゾーシイ".to_string()
                    ]
                );
            }
        }
    }
}