use crate::tokenizers::extractors::TokenExtractor;
use crate::tokenizers::models::{Token, Tokens};
use crate::EyeciteError;
use std::collections::HashMap;

pub mod extractors;
pub mod models;

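/// Splits text into a stream of [`Token`]s using a set of
/// [`TokenExtractor`]s. Implementors supply `get_extractors`;
/// `extract_tokens` and `tokenize` are provided as default methods.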
pub trait Tokenizer<'a> {
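    /// Returns the extractors worth running against `text`.
    /// Implementations may over-approximate: an extractor that ends up
    /// producing no matches is harmless.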
    fn get_extractors(&'a self, text: &'a str)
        -> Box<dyn Iterator<Item = &'a TokenExtractor> + 'a>;

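    /// Runs every extractor returned by [`Self::get_extractors`] over
    /// `text` and collects the tokens from all of their matches.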
    fn extract_tokens(&'a self, text: &'a str) -> Vec<Token<'a>> {
        let mut tokens: Vec<Token<'a>> = self
            .get_extractors(text)
            .flat_map(|e| e.get_matches(text).into_iter().map(move |m| (e, m)))
            .map(|(e, m)| e.get_token(m))
            .collect();

        // Matches are collected extractor by extractor, but `tokenize`
        // consumes them assuming text order, so sort by span here.
        tokens.sort_by_key(|t| (t.start(), t.end()));
        tokens
    }

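    /// Tokenizes `text` into the full token stream (extracted tokens
    /// interleaved with `Word` and `Space` tokens for the surrounding
    /// text), together with the list of just the extracted tokens, each
    /// paired with its index into the full stream.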
    fn tokenize(&'a self, text: &'a str) -> (Tokens<'a>, Vec<(usize, Token<'a>)>) {
        let mut citation_tokens: Vec<(usize, Token)> = Vec::new();
        let mut all_tokens: Vec<Token> = Vec::new();

        let tokens = self.extract_tokens(text);
        let mut last_token: Option<Token> = None;
        let mut offset: usize = 0;

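        // Splits plain text into Word and Space tokens. Every part of
        // the `split(' ')` output is followed by one Space standing in
        // for the separator after it (an empty part, produced by
        // consecutive spaces, contributes only its Space), and the extra
        // Space pushed after the final part is popped off at the end.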
        fn append_text<'a>(tokens: &mut Vec<Token<'a>>, text: &'a str) {
            for part in text.split(' ') {
                if !part.is_empty() {
                    tokens.push(Token::Word(part));
                }
                tokens.push(Token::Space);
            }

            tokens.pop();
        }

        for token in tokens {
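            // If this token merges with the previous one (e.g. the same
            // span matched by two different extractors), swap the merged
            // token in for the entries pushed on the previous iteration.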
            if let Some(last) = last_token.as_mut() {
                if let Some(merged) = last.merge(&token) {
                    citation_tokens.pop();
                    all_tokens.pop();

                    citation_tokens.push((all_tokens.len(), merged.clone()));
                    all_tokens.push(merged);

                    continue;
                }
            }

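            // Skip tokens that start inside a span we have already emitted.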
            if offset > token.start() {
                continue;
            }

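            // Emit the plain text between the previous token and this one.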
            if offset < token.start() {
                append_text(&mut all_tokens, &text[offset..token.start()]);
            }

            citation_tokens.push((all_tokens.len(), token.clone()));
            all_tokens.push(token.clone());
            offset = token.end();
            last_token = Some(token);
        }

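        // Emit any plain text remaining after the last token.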
        if offset < text.len() {
            append_text(&mut all_tokens, &text[offset..]);
        }

        (all_tokens, citation_tokens)
    }
}

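/// A [`Tokenizer`] backed by a daachorse Aho-Corasick automaton. Each
/// extractor is indexed under its trigger strings, so only extractors
/// whose strings actually occur in the input are run against it.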
pub struct Ahocorasick<'a> {
    extractors: HashMap<String, Vec<&'a TokenExtractor>>,
    strings: Vec<String>,
    corasick: daachorse::DoubleArrayAhoCorasick,
}

impl<'a> Ahocorasick<'a> {
    pub fn new(items: &'a [TokenExtractor]) -> Result<Self, EyeciteError> {
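        // Index each extractor under every trigger string it declares.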
        let mut extractors: HashMap<String, Vec<_>> = HashMap::new();

        for e in items {
            for s in e.strings.iter().cloned() {
                extractors.entry(s).or_default().push(e);
            }
        }

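        // daachorse assigns each pattern a value equal to its position in
        // the slice passed to `build`, so `strings` doubles as the map
        // from a match value back to its pattern string.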
        let strings: Vec<_> = extractors.keys().cloned().collect();

        let corasick = daachorse::DoubleArrayAhoCorasickBuilder::new().build(strings.as_slice())?;

        Ok(Self {
            extractors,
            strings,
            corasick,
        })
    }
}

impl<'a> Tokenizer<'a> for Ahocorasick<'a> {
    fn get_extractors(
        &'a self,
        text: &'a str,
    ) -> Box<dyn Iterator<Item = &'a TokenExtractor> + 'a> {
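        // For every occurrence of a trigger string in `text`, yield the
        // extractors registered under that string. An extractor whose
        // trigger occurs several times is yielded once per occurrence.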
        Box::new(self.corasick.find_iter(text).flat_map(|m| {
            self.extractors[self.strings[m.value()].as_str()]
                .as_slice()
                .iter()
                .copied()
        }))
    }
}

#[cfg(test)]
mod tests {
    use super::extractors::EXTRACTORS;
    use crate::tokenizers::extractors::TokenExtractorExtra;
    use crate::tokenizers::models::{Token, TokenData};
    use crate::tokenizers::{Ahocorasick, Tokenizer};
    use reporters_db::laws::NaiveDateTime;
    use reporters_db::reporters::Edition;
    use std::str::FromStr;

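    // End-to-end check: a sentence with a stop word, a "v." case name,
    // and a U.S. Reports cite yields the full word/space stream plus
    // the extracted tokens with their stream indices.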
    #[test]
    fn tokenize() {
        let tokenizer = Ahocorasick::new(EXTRACTORS.as_slice()).unwrap();

        let (all_tokens, tokens) = tokenizer.tokenize("See Roe v. Wade, 410 U. S. 113 (1973)");

        let stop_word_extra = TokenExtractorExtra {
            exact_editions: vec![],
            variation_editions: vec![],
            short: false,
        };

        let edition_extra = TokenExtractorExtra {
            exact_editions: vec![],
            variation_editions: vec![Edition {
                end: None,
                start: Some(NaiveDateTime::from_str("1875-01-01T00:00:00").unwrap()),
                regexes: None,
            }],
            short: false,
        };

        let see_token = Token::StopWord(TokenData {
            data: "See",
            start: 0,
            end: 3,
            extra: &stop_word_extra,
            groups: vec![("stop_word".into(), "See")].into_iter().collect(),
        });

        let v_token = Token::StopWord(TokenData {
            data: "v.",
            start: 8,
            end: 10,
            extra: &stop_word_extra,
            groups: vec![("stop_word".into(), "v")].into_iter().collect(),
        });

        let us_citation = Token::Citation(TokenData {
            data: "410 U. S. 113",
            start: 17,
            end: 30,
            extra: &edition_extra,
            groups: vec![
                ("reporter".into(), "U. S."),
                ("volume".into(), "410"),
                ("page".into(), "113"),
            ]
            .into_iter()
            .collect(),
        });

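        // The full stream keeps non-citation text as Word tokens, with
        // one Space token per separating space.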
        let expected_tokens = vec![
            see_token.clone(),
            Token::Space,
            Token::Word("Roe"),
            Token::Space,
            v_token.clone(),
            Token::Space,
            Token::Word("Wade,"),
            Token::Space,
            us_citation.clone(),
            Token::Space,
            Token::Word("(1973)"),
        ];

        assert_eq!(all_tokens, expected_tokens);
        assert_eq!(tokens, vec![(0, see_token), (4, v_token), (8, us_citation)]);
    }
}