jpreprocess_dictionary/tokenizer/
jpreprocess.rs1use std::borrow::Cow;
2
3use jpreprocess_core::{error::DictionaryError, word_entry::WordEntry, JPreprocessResult};
4
5use crate::{
6 dictionary::word_encoding::JPreprocessDictionaryWordEncoding, word_data::get_word_data,
7};
8
9use super::{Token, Tokenizer};
10
11pub struct JPreprocessTokenizer {
12 tokenizer: lindera::tokenizer::Tokenizer,
13}
14
15impl JPreprocessTokenizer {
16 pub fn new(tokenizer: lindera::tokenizer::Tokenizer) -> Self {
17 Self { tokenizer }
18 }
19
20 fn get_word(
21 &self,
22 word_id: lindera_dictionary::viterbi::WordId,
23 ) -> Result<WordEntry, DictionaryError> {
24 if word_id.is_unknown() {
25 Ok(WordEntry::default())
26 } else if word_id.is_system() {
27 Self::get_word_from_prefixdict(
28 &self.tokenizer.segmenter.dictionary.prefix_dictionary,
29 word_id,
30 )
31 } else {
32 let user = &self.tokenizer.segmenter.user_dictionary;
33 user.as_ref()
34 .map_or(Err(DictionaryError::UserDictionaryNotProvided), |user| {
35 Self::get_word_from_prefixdict(&user.dict, word_id)
36 })
37 }
38 }
39
40 pub(super) fn get_word_from_prefixdict(
42 prefix_dict: &lindera_dictionary::dictionary::prefix_dictionary::PrefixDictionary,
43 word_id: lindera_dictionary::viterbi::WordId,
44 ) -> Result<WordEntry, DictionaryError> {
45 if word_id.is_unknown() {
46 Ok(WordEntry::default())
47 } else {
48 let data = get_word_data(
49 &prefix_dict.words_idx_data,
50 &prefix_dict.words_data,
51 Some(word_id.id as usize),
52 )
53 .ok_or(DictionaryError::IdNotFound(word_id.id))?;
54 Ok(JPreprocessDictionaryWordEncoding::deserialize(data)?)
55 }
56 }
57}
58
59impl Tokenizer for JPreprocessTokenizer {
60 fn tokenize<'a>(&'a self, text: &'a str) -> JPreprocessResult<Vec<impl 'a + Token>> {
61 let words = self.tokenizer.tokenize(text).unwrap();
62 words
63 .into_iter()
64 .map(|token| {
65 Ok(JPreprocessToken::new(
66 token.surface,
67 self.get_word(token.word_id)?,
68 ))
69 })
70 .collect::<Result<_, _>>()
71 }
72}
73
74pub struct JPreprocessToken<'a> {
75 text: Cow<'a, str>,
76 entry: WordEntry,
77}
78
79impl<'a> JPreprocessToken<'a> {
80 pub(crate) fn new(text: Cow<'a, str>, entry: WordEntry) -> Self {
81 Self { text, entry }
82 }
83}
84
85impl Token for JPreprocessToken<'_> {
86 fn fetch(&mut self) -> Result<(&str, WordEntry), jpreprocess_core::JPreprocessError> {
87 Ok((&self.text, self.entry.clone()))
88 }
89}