summa_core/components/tokenizers/dict_tokenizer.rs

use aho_corasick::MatchKind;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
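
/// Tokenizer that emits tokens only for phrases found in the bundled dictionary,
/// normalizing each match to the canonical (first) form of its synonym set.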
#[derive(Clone)]
pub struct DictTokenizer {
    ac: aho_corasick::AhoCorasick,
    words: Vec<String>,
    dict: Vec<usize>,
}

impl DictTokenizer {
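    /// Builds the tokenizer by loading the synonym dictionary bundled as a CSV
    /// resource and compiling a case-insensitive Aho-Corasick automaton over it.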
    pub fn new() -> DictTokenizer {
        let mut synsets = vec![];
        let mut csv_reader = csv::ReaderBuilder::new()
            .has_headers(false)
            .flexible(true)
            .from_reader(include_bytes!("../../../resources/drugs.csv").as_slice());
        for record in csv_reader.records() {
            let mut synset = vec![];
            for word in record.expect("dictionary is broken").iter() {
                synset.push(word.to_string())
            }
            synsets.push(synset);
        }
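
        // Flatten all synsets into a single pattern list; `dict[i]` stores the index
        // of the first (canonical) word of the synset that pattern `i` belongs to.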
        let mut base_offset = 0;
        let mut dict = vec![];
        let words: Vec<String> = synsets
            .into_iter()
            .flat_map(|synset| {
                dict.extend(std::iter::repeat(base_offset).take(synset.len()));
                base_offset += synset.len();
                synset
            })
            .collect();
        let ac = aho_corasick::AhoCorasickBuilder::new()
            .ascii_case_insensitive(true)
            .match_kind(MatchKind::LeftmostLongest)
            .build(words.iter())
            .expect("internal error");
        DictTokenizer { ac, words, dict }
    }
}

impl Default for DictTokenizer {
    fn default() -> Self {
        DictTokenizer::new()
    }
}
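
/// `TokenStream` that yields one token per accepted dictionary match in a single text.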
pub struct DictTokenStream<'a> {
    text: &'a str,
    words: &'a Vec<String>,
    dict: &'a Vec<usize>,
    ah_iter: aho_corasick::FindIter<'a, 'a>,
    token: Token,
}

impl<'a> DictTokenStream<'a> {
    pub fn new(text: &'a str, words: &'a Vec<String>, dict: &'a Vec<usize>, ac: &'a aho_corasick::AhoCorasick) -> DictTokenStream<'a> {
        DictTokenStream {
            text,
            words,
            dict,
            ah_iter: ac.find_iter(text),
            token: Token::default(),
        }
    }
}

impl Tokenizer for DictTokenizer {
    type TokenStream<'a> = DictTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> DictTokenStream<'a> {
        DictTokenStream::new(text, &self.words, &self.dict, &self.ac)
    }
}

impl<'a> TokenStream for DictTokenStream<'a> {
    fn advance(&mut self) -> bool {
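        // Reset the token text and bump the position; `Token::default()` starts
        // `position` at a sentinel value, so the first wrapping increment yields 0.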
        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);
        for pattern in self.ah_iter.by_ref() {
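            // Accept a match only when it is delimited by ASCII punctuation,
            // whitespace, or the text boundary on both sides, so that patterns
            // glued to other characters (e.g. "FOXP21") are skipped.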
            let properly_beginning = pattern.start() == 0
                || self.text.as_bytes()[pattern.start() - 1].is_ascii_punctuation()
                || self.text.as_bytes()[pattern.start() - 1].is_ascii_whitespace();
            let properly_ending = pattern.end() == self.text.len()
                || self.text.as_bytes()[pattern.end()].is_ascii_punctuation()
                || self.text.as_bytes()[pattern.end()].is_ascii_whitespace();
            if properly_beginning && properly_ending {
                self.token.offset_from = pattern.start();
                self.token.offset_to = pattern.end();
                // Emit the canonical form of the matched synset rather than the raw match.
                self.token.text.push_str(&self.words[self.dict[pattern.pattern().as_usize()]]);
                return true;
            }
        }
        false
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

#[cfg(test)]
pub mod tests {
    use tantivy::tokenizer::{TextAnalyzer, Token, TokenizerManager};

    use super::DictTokenizer;

    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(token.position, position, "expected position {} but {:?}", position, token);
        assert_eq!(token.text, text, "expected text {} but {:?}", text, token);
        assert_eq!(token.offset_from, from, "expected offset_from {} but {:?}", from, token);
        assert_eq!(token.offset_to, to, "expected offset_to {} but {:?}", to, token);
    }
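
    // The bundled dictionary is expected to contain a FOXP2 synset: matches must be
    // normalized to the canonical form "foxp2", while occurrences glued to other
    // characters ("FOXP21", "FOXP2ген") must not produce tokens.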
    #[test]
    fn test_dict_tokenizer() {
        let tokenizer_manager = TokenizerManager::default();
        tokenizer_manager.register("tokenizer", TextAnalyzer::builder(DictTokenizer::new()).build());
        let mut tokenizer = tokenizer_manager.get("tokenizer").unwrap();
        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
            tokenizer
                .token_stream("FOXP2 gene (not FOXP21) can be correlated with autism spectrum disorder or just autismo")
                .process(&mut add_token);
        }

        assert_eq!(tokens.len(), 1);
        assert_token(&tokens[0], 0, "foxp2", 0, 5);

        let mut tokens: Vec<Token> = vec![];
        {
            let mut add_token = |token: &Token| {
                tokens.push(token.clone());
            };
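            // Russian text where "FOXP2" is glued to "ген" ("gene"): the boundary
            // check must reject the match, so no tokens are expected.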
            tokenizer.token_stream("FOXP2ген связан с аутизмом").process(&mut add_token);
        }

        assert_eq!(tokens.len(), 0);
    }
}