// eyecite/tokenizers.rs

use crate::tokenizers::extractors::TokenExtractor;
use crate::tokenizers::models::{Token, Tokens};
use crate::EyeciteError;
use std::collections::HashMap;

pub mod extractors;
pub mod models;

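/// Splits text into a stream of `Token`s and reports which of those tokens
/// are citation matches. A minimal usage sketch (mirroring the test at the
/// bottom of this file; `EXTRACTORS` lives in the `extractors` submodule):
///
/// ```ignore
/// let tokenizer = Ahocorasick::new(extractors::EXTRACTORS.as_slice()).unwrap();
/// let (all_tokens, citation_tokens) =
///     tokenizer.tokenize("See Roe v. Wade, 410 U. S. 113 (1973)");
/// // `all_tokens` interleaves citation tokens with Word/Space tokens;
/// // `citation_tokens` pairs each citation with its index into `all_tokens`.
/// ```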
pub trait Tokenizer<'a> {
    fn get_extractors(&'a self, text: &'a str)
        -> Box<dyn Iterator<Item = &'a TokenExtractor> + 'a>;

    fn extract_tokens(&'a self, text: &'a str) -> Vec<Token<'a>> {
        self.get_extractors(text)
            .flat_map(|e| e.get_matches(text).into_iter().map(move |m| (e, m)))
            .map(|(e, m)| e.get_token(m))
            .collect()
    }

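    /// Tokenize `text`, returning the full token stream (citation tokens
    /// interleaved with `Word`/`Space` tokens for the surrounding plain text)
    /// plus each citation token paired with its index into that stream.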
    fn tokenize(&'a self, text: &'a str) -> (Tokens<'a>, Vec<(usize, Token<'a>)>) {
        let mut citation_tokens: Vec<(usize, Token)> = Vec::new();
        let mut all_tokens: Vec<Token> = Vec::new();

        let tokens = self.extract_tokens(text);
        let mut last_token: Option<Token> = None;
        let mut offset: usize = 0;

        /// Split text into words, treating each space as its own `Space`
        /// token, and append the result to `tokens`. NOTE: this is a
        /// significant portion of the total runtime of get_citations(), so
        /// benchmark before changing it.
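        ///
        /// For example (illustrative), `append_text(&mut tokens, "Roe  v.")`
        /// (note the two spaces) appends
        /// `[Word("Roe"), Space, Space, Word("v.")]`.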
        fn append_text<'a>(tokens: &mut Vec<Token<'a>>, text: &'a str) {
            for part in text.split(' ') {
                // TODO: maybe filter out the empty parts produced by
                // consecutive spaces
                if !part.is_empty() {
                    tokens.push(Token::Word(part));
                    tokens.push(Token::Space);
                } else {
                    tokens.push(Token::Space);
                }
            }

            tokens.pop(); // remove final extra space
        }

        for token in tokens {
            if let Some(last) = last_token.as_mut() {
                // Sometimes the exact same cite is matched by two different
                // regexes. Attempt to merge rather than discarding one or the
                // other:
                let merged = last.merge(&token);
                if let Some(merged) = merged {
                    citation_tokens.pop();
                    all_tokens.pop();

                    citation_tokens.push((all_tokens.len(), merged.clone()));
                    all_tokens.push(merged);

                    continue;
                }
            }

            // skip this match if it overlaps one we already captured
            if offset > token.start() {
                continue;
            }

            if offset < token.start() {
                // capture plain text before each match
                append_text(&mut all_tokens, &text[offset..token.start()]);
            }

            // capture match
            citation_tokens.push((all_tokens.len(), token.clone()));
            all_tokens.push(token.clone());
            offset = token.end();
            last_token = Some(token);
        }

        // capture plain text after final match
        if offset < text.len() {
            append_text(&mut all_tokens, &text[offset..]);
        }

        (all_tokens, citation_tokens)
    }
}

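/// A `Tokenizer` backed by a daachorse double-array Aho-Corasick automaton.
/// Each extractor's trigger strings become automaton patterns, so only the
/// extractors whose trigger strings actually occur in the text are asked to
/// run their regexes.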
pub struct Ahocorasick<'a> {
    extractors: HashMap<String, Vec<&'a TokenExtractor>>,
    strings: Vec<String>,
    corasick: daachorse::DoubleArrayAhoCorasick,
}

impl<'a> Ahocorasick<'a> {
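    /// Build a tokenizer from `items`: each extractor is indexed under every
    /// one of its trigger strings, and a daachorse automaton is built over
    /// those strings so matches can be mapped back to their extractors.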
    pub fn new(items: &'a [TokenExtractor]) -> Result<Self, EyeciteError> {
        let mut extractors: HashMap<String, Vec<_>> = HashMap::new();

        for e in items {
            for s in e.strings.iter().cloned() {
                extractors.entry(s).or_default().push(e);
            }
        }

        let strings: Vec<_> = extractors.keys().cloned().collect();

        let corasick = daachorse::DoubleArrayAhoCorasickBuilder::new().build(strings.as_slice())?;

        Ok(Self {
            extractors,
            strings,
            corasick,
        })
    }
}

impl<'a> Tokenizer<'a> for Ahocorasick<'a> {
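    /// For every occurrence of a trigger string in `text`, yield the
    /// extractors registered for that string (an extractor may therefore be
    /// yielded more than once).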
    fn get_extractors(
        &'a self,
        text: &'a str,
    ) -> Box<dyn Iterator<Item = &'a TokenExtractor> + 'a> {
        Box::new(self.corasick.find_iter(text).flat_map(|m| {
            self.extractors[self.strings[m.value()].as_str()]
                .as_slice()
                .iter()
                .copied()
        }))
    }
}

#[cfg(test)]
mod tests {
    use super::extractors::EXTRACTORS;
    use crate::tokenizers::extractors::TokenExtractorExtra;
    use crate::tokenizers::models::{Token, TokenData};
    use crate::tokenizers::{Ahocorasick, Tokenizer};
    use reporters_db::laws::NaiveDateTime;
    use reporters_db::reporters::Edition;
    use std::str::FromStr;

    #[test]
    fn tokenize() {
        let tokenizer = Ahocorasick::new(EXTRACTORS.as_slice()).unwrap();

        let (all_tokens, tokens) = tokenizer.tokenize("See Roe v. Wade, 410 U. S. 113 (1973)");

        let stop_word_extra = TokenExtractorExtra {
            exact_editions: vec![],
            variation_editions: vec![],
            short: false,
        };

        let edition_extra = TokenExtractorExtra {
            exact_editions: vec![],
            variation_editions: vec![Edition {
                end: None,
                start: Some(NaiveDateTime::from_str("1875-01-01T00:00:00").unwrap()),
                regexes: None,
            }],
            short: false,
        };

        let see_token = Token::StopWord(TokenData {
            data: "See",
            start: 0,
            end: 3,
            extra: &stop_word_extra,
            groups: vec![("stop_word".into(), "See")].into_iter().collect(),
        });

        let v_token = Token::StopWord(TokenData {
            data: "v.",
            start: 8,
            end: 10,
            extra: &stop_word_extra,
            groups: vec![("stop_word".into(), "v")].into_iter().collect(),
        });

        let us_citation = Token::Citation(TokenData {
            data: "410 U. S. 113",
            start: 17,
            end: 30,
            extra: &edition_extra,
            groups: vec![
                ("reporter".into(), "U. S."),
                ("volume".into(), "410"),
                ("page".into(), "113"),
            ]
            .into_iter()
            .collect(),
        });

        let expected_tokens = vec![
            see_token.clone(),
            Token::Space,
            Token::Word("Roe"),
            Token::Space,
            v_token.clone(),
            Token::Space,
            Token::Word("Wade,"),
            Token::Space,
            us_citation.clone(),
            Token::Space,
            Token::Word("(1973)"),
        ];

        assert_eq!(all_tokens, expected_tokens);
        assert_eq!(tokens, vec![(0, see_token), (4, v_token), (8, us_citation)]);
    }
}