summa_core/components/tokenizers/dict_tokenizer.rs

use aho_corasick::MatchKind;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

/// Tokenizer that emits tokens only for dictionary terms found in the text.
///
/// The dictionary is loaded from `resources/drugs.csv`, where every row is a synset;
/// each match is normalized to the first word of its synset.
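///
/// A minimal usage sketch; `foxp2` is taken from the test at the bottom of this
/// file, and any other matches depend entirely on the bundled dictionary:
///
/// ```ignore
/// use tantivy::tokenizer::{TokenStream, Tokenizer};
///
/// let mut tokenizer = DictTokenizer::new();
/// let mut stream = tokenizer.token_stream("FOXP2 gene");
/// while stream.advance() {
///     assert_eq!(stream.token().text, "foxp2");
/// }
/// ```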
#[derive(Clone)]
pub struct DictTokenizer {
    ac: aho_corasick::AhoCorasick,
    words: Vec<String>,
    dict: Vec<usize>,
}
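
// How `words` and `dict` fit together (the rows below are illustrative; the real
// entries come from `resources/drugs.csv`): each CSV row is a synset, `words` is
// the flat list of all synset members, and `dict[i]` is the index of the first
// word of the synset that `words[i]` belongs to, so every match is normalized to
// that first word.
//
//   drugs.csv:  aspirin,acetylsalicylic acid
//               ibuprofen,advil
//   words:      ["aspirin", "acetylsalicylic acid", "ibuprofen", "advil"]
//   dict:       [0, 0, 2, 2]
//
// A match on pattern 3 ("advil") therefore emits words[dict[3]] == "ibuprofen".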

impl DictTokenizer {
    /// Builds the tokenizer from the dictionary bundled into the binary.
    pub fn new() -> DictTokenizer {
        // Every CSV row is one synset; rows have no header and may contain a
        // variable number of columns, hence `flexible(true)`.
        let mut synsets = vec![];
        let mut csv_reader = csv::ReaderBuilder::new()
            .has_headers(false)
            .flexible(true)
            .from_reader(include_bytes!("../../../resources/drugs.csv").as_slice());
        for record in csv_reader.records() {
            let mut synset = vec![];
            for word in record.expect("dictionary is broken").iter() {
                synset.push(word.to_string())
            }
            synsets.push(synset);
        }

        // Flatten the synsets into `words` and record, for every word, the offset
        // of the first word of its synset in `dict`.
        let mut base_offset = 0;
        let mut dict = vec![];
        let words: Vec<String> = synsets
            .into_iter()
            .flat_map(|synset| {
                dict.extend(std::iter::repeat(base_offset).take(synset.len()));
                base_offset += synset.len();
                synset
            })
            .collect();
        // Case-insensitive, leftmost-longest matching over all dictionary words.
        let ac = aho_corasick::AhoCorasickBuilder::new()
            .ascii_case_insensitive(true)
            .match_kind(MatchKind::LeftmostLongest)
            .build(words.iter())
            .expect("internal error");
        DictTokenizer { ac, words, dict }
    }
}

impl Default for DictTokenizer {
    fn default() -> Self {
        DictTokenizer::new()
    }
}

/// Token stream yielding one token per dictionary match found in `text`.
pub struct DictTokenStream<'a> {
    text: &'a str,
    words: &'a Vec<String>,
    dict: &'a Vec<usize>,
    ah_iter: aho_corasick::FindIter<'a, 'a>,
    token: Token,
}

impl<'a> DictTokenStream<'a> {
    pub fn new(text: &'a str, words: &'a Vec<String>, dict: &'a Vec<usize>, ac: &'a aho_corasick::AhoCorasick) -> DictTokenStream<'a> {
        DictTokenStream {
            text,
            words,
            dict,
            ah_iter: ac.find_iter(text),
            token: Token::default(),
        }
    }
}

impl Tokenizer for DictTokenizer {
    type TokenStream<'a> = DictTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> DictTokenStream<'a> {
        DictTokenStream::new(text, &self.words, &self.dict, &self.ac)
    }
}
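
// A hedged sketch of registering this tokenizer on a tantivy index; the "dict"
// tokenizer name and the "body" field below are illustrative, not taken from
// this crate:
//
//     use tantivy::schema::{Schema, TextFieldIndexing, TextOptions};
//     use tantivy::tokenizer::TextAnalyzer;
//
//     let mut schema_builder = Schema::builder();
//     let options = TextOptions::default()
//         .set_indexing_options(TextFieldIndexing::default().set_tokenizer("dict"));
//     schema_builder.add_text_field("body", options);
//     let index = tantivy::Index::create_in_ram(schema_builder.build());
//     index
//         .tokenizers()
//         .register("dict", TextAnalyzer::builder(DictTokenizer::new()).build());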

impl<'a> TokenStream for DictTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);
        for pattern in self.ah_iter.by_ref() {
            // Accept a match only if it sits on word boundaries: the byte before and
            // the byte after the match (if any) must be ASCII punctuation or whitespace,
            // so e.g. "FOXP2" inside "FOXP21" is rejected.
            let properly_beginning = pattern.start() == 0
                || self.text.as_bytes()[pattern.start() - 1].is_ascii_punctuation()
                || self.text.as_bytes()[pattern.start() - 1].is_ascii_whitespace();
            let properly_ending = pattern.end() == self.text.len()
                || self.text.as_bytes()[pattern.end()].is_ascii_punctuation()
                || self.text.as_bytes()[pattern.end()].is_ascii_whitespace();
            if properly_beginning && properly_ending {
                self.token.offset_from = pattern.start();
                self.token.offset_to = pattern.end();
                // Normalize the matched pattern to the first word of its synset.
                self.token.text.push_str(&self.words[self.dict[pattern.pattern().as_usize()]]);
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

#[cfg(test)]
pub mod tests {
    use tantivy::tokenizer::{TextAnalyzer, Token, TokenizerManager};

    use super::DictTokenizer;

    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(token.position, position, "expected position {} but got {:?}", position, token);
        assert_eq!(token.text, text, "expected text {} but got {:?}", text, token);
        assert_eq!(token.offset_from, from, "expected offset_from {} but got {:?}", from, token);
        assert_eq!(token.offset_to, to, "expected offset_to {} but got {:?}", to, token);
    }

    #[test]
    fn test_dict_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register("tokenizer", TextAnalyzer::builder(DictTokenizer::new()).build());
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer
                .token_stream("FOXP2 gene (not FOXP21) can be correlated with autism spectrum disorder or just autismo")
                .process(&mut add_token);
        }

        assert_eq!(tokens.len(), 1);
        assert_token(&tokens[0], 0, "foxp2", 0, 5);

        // "FOXP2ген связан с аутизмом" ("FOXP2gene is associated with autism"): the
        // match "FOXP2" is followed by a non-ASCII byte, so the boundary check fails
        // and no token is emitted.
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer.token_stream("FOXP2ген связан с аутизмом").process(&mut add_token);
        }

        assert_eq!(tokens.len(), 0);
    }
}