jpreprocess_dictionary/tokenizer/jpreprocess.rs

use std::borrow::Cow;

use jpreprocess_core::{error::DictionaryError, word_entry::WordEntry, JPreprocessResult};

use crate::{
    dictionary::word_encoding::JPreprocessDictionaryWordEncoding, word_data::get_word_data,
};

use super::{Token, Tokenizer};

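/// A [`Tokenizer`] implementation backed by a [`lindera::tokenizer::Tokenizer`]
/// whose dictionary stores entries in the JPreprocess word encoding.
///
/// # Example
///
/// A minimal construction sketch (marked `ignore`): the
/// `load_dictionary_from_kind` / `Segmenter` calls are assumptions about
/// lindera's public API, and the loaded dictionary must have been built with
/// the JPreprocess word encoding for entry deserialization to succeed.
///
/// ```ignore
/// use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
/// use lindera::mode::Mode;
/// use lindera::segmenter::Segmenter;
///
/// // Assumption: `DictionaryKind::IPADIC` stands in for whatever
/// // JPreprocess-encoded dictionary your build provides.
/// let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC)?;
/// let segmenter = Segmenter::new(Mode::Normal, dictionary, None); // no user dictionary
/// let tokenizer = JPreprocessTokenizer::new(lindera::tokenizer::Tokenizer::new(segmenter));
/// ```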
pub struct JPreprocessTokenizer {
    tokenizer: lindera::tokenizer::Tokenizer,
}

impl JPreprocessTokenizer {
    pub fn new(tokenizer: lindera::tokenizer::Tokenizer) -> Self {
        Self { tokenizer }
    }

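    /// Resolves a lindera `WordId` to its [`WordEntry`], dispatching on the
    /// id's origin: unknown words yield a default entry, system ids are looked
    /// up in the system prefix dictionary, and all remaining ids are resolved
    /// against the user dictionary (an error if none was provided).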
    fn get_word(
        &self,
        word_id: lindera_dictionary::viterbi::WordId,
    ) -> Result<WordEntry, DictionaryError> {
        if word_id.is_unknown() {
            Ok(WordEntry::default())
        } else if word_id.is_system() {
            Self::get_word_from_prefixdict(
                &self.tokenizer.segmenter.dictionary.prefix_dictionary,
                word_id,
            )
        } else {
            let user = &self.tokenizer.segmenter.user_dictionary;
            user.as_ref()
                .map_or(Err(DictionaryError::UserDictionaryNotProvided), |user| {
                    Self::get_word_from_prefixdict(&user.dict, word_id)
                })
        }
    }

    /// PANIC: The caller must ensure that `prefix_dict` is the correct
    /// dictionary for the given `word_id`.
    pub(super) fn get_word_from_prefixdict(
        prefix_dict: &lindera_dictionary::dictionary::prefix_dictionary::PrefixDictionary,
        word_id: lindera_dictionary::viterbi::WordId,
    ) -> Result<WordEntry, DictionaryError> {
        if word_id.is_unknown() {
            Ok(WordEntry::default())
        } else {
            // `words_idx_data` maps a word id to the byte range of its entry
            // in `words_data`; decode that entry with the JPreprocess word
            // encoding.
            let data = get_word_data(
                &prefix_dict.words_idx_data,
                &prefix_dict.words_data,
                Some(word_id.id as usize),
            )
            .ok_or(DictionaryError::IdNotFound(word_id.id))?;
            Ok(JPreprocessDictionaryWordEncoding::deserialize(data)?)
        }
    }
}

impl Tokenizer for JPreprocessTokenizer {
    fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>> {
        // NOTE: panics if lindera itself fails to tokenize; only dictionary
        // lookup errors are surfaced through the returned `JPreprocessResult`.
        let words = self.tokenizer.tokenize(text).unwrap();
        words
            .into_iter()
            .map(|token| {
                Ok(JPreprocessToken::new(
                    token.surface,
                    self.get_word(token.word_id)?,
                ))
            })
            .collect::<Result<_, _>>()
    }
}

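/// A token produced by [`JPreprocessTokenizer`]: the surface form of the word
/// paired with its already-resolved dictionary [`WordEntry`].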
pub struct JPreprocessToken<'a> {
    text: Cow<'a, str>,
    entry: WordEntry,
}

impl<'a> JPreprocessToken<'a> {
    pub(crate) fn new(text: Cow<'a, str>, entry: WordEntry) -> Self {
        Self { text, entry }
    }
}

impl Token for JPreprocessToken<'_> {
    fn fetch(&mut self) -> Result<(&str, WordEntry), jpreprocess_core::JPreprocessError> {
        Ok((&self.text, self.entry.clone()))
    }
}
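
// Usage sketch for `fetch` (assumptions: `tokenizer` is built as in the
// `JPreprocessTokenizer` example above, and `WordEntry` implements `Debug`;
// `fetch` takes `&mut self`, hence the mutable iteration):
//
//     let mut tokens = tokenizer.tokenize("日本語のテキスト")?;
//     for token in tokens.iter_mut() {
//         let (surface, entry) = token.fetch()?;
//         println!("{surface}\t{entry:?}");
//     }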