// eyecite/tokenizers/extractors.rs

1use crate::regexes;
2use crate::tokenizers::models::{Token, TokenData, TokenFactories, TokenFactory};
3use lazy_static::lazy_static;
4use reporters_db::regexes::{RegexTemplate, ResolvedRegex};
5use reporters_db::reporters::{reporters, Edition, EditionName};
6use reporters_db::utils::process_variables;
7use std::collections::{HashMap, HashSet};
8
/// Edition metadata carried by a citation extractor: which editions its
/// regex matches by their canonical name, which it matches via variant
/// spellings, and whether the regex is a short-cite form.
#[derive(Default, Debug, Clone, Eq, PartialEq)]
pub struct TokenExtractorExtra {
    // Editions whose canonical name this extractor's regex matches.
    pub exact_editions: Vec<Edition>,
    // Editions matched through a variant (non-canonical) spelling.
    pub variation_editions: Vec<Edition>,
    // True when the regex is a short-cite form (built via `short_cite_re`).
    pub short: bool,
}
15
/// A single regex match together with the capture-group names of the regex
/// that produced it, so named groups can be extracted when building a Token.
pub struct TokenMatch<'a> {
    // The captures for one match in the scanned text.
    pub(crate) regex_match: regex::Captures<'a>,
    // Named capture groups of the producing regex (copied per match).
    pub(crate) names: Vec<&'a str>,
}
20
/// A compiled token extractor: a resolved source regex, the factory that
/// turns its matches into tokens, and associated edition metadata.
#[derive(Debug)]
pub struct TokenExtractor {
    // The fully substituted source pattern this extractor was built from.
    pub regex: ResolvedRegex,
    // Factory used to build tokens from matches of `regex`.
    pub token_factory: TokenFactories,
    // Edition metadata attached to every token this extractor produces.
    pub extra: TokenExtractorExtra,
    // Literal reporter strings associated with this regex
    // (NOTE(review): presumably used by callers as a fast pre-filter
    // before running the regex — confirm against call sites).
    pub strings: HashSet<String>,
    // Whether `built_regex` was compiled case-insensitively.
    pub ignore_case: bool,
    // `regex` compiled once at construction time.
    built_regex: regex::Regex,
}
30
31impl TokenExtractor {
32    pub fn new(
33        regex: ResolvedRegex,
34        token_factory: TokenFactories,
35        ignore_case: bool,
36        strings: HashSet<String>,
37        extra: TokenExtractorExtra,
38    ) -> Self {
39        let built_regex = regex::RegexBuilder::new(regex.value())
40            .case_insensitive(ignore_case)
41            .build()
42            .expect("unable to build regex");
43
44        Self {
45            regex,
46            token_factory,
47            built_regex,
48            ignore_case,
49            strings,
50            extra,
51        }
52    }
53
54    /// Return match objects for all matches in text.
55    pub fn get_matches<'a>(&'a self, text: &'a str) -> Vec<TokenMatch<'a>> {
56        let matches = self.built_regex.captures_iter(text);
57        let names: Vec<_> = self.built_regex.capture_names().flatten().collect();
58
59        matches
60            .into_iter()
61            .map(|regex_match| TokenMatch {
62                regex_match,
63                names: names.clone(),
64            })
65            .collect()
66    }
67
68    /// For a given match object, return a Token.
69    pub fn get_token<'a>(&'a self, token_match: TokenMatch<'a>) -> Token<'a> {
70        let m = token_match.regex_match.get(1).unwrap();
71        let start = m.start();
72        let end = m.end();
73        let data: &'a str = m.as_str();
74
75        let extra: &'a TokenExtractorExtra = &self.extra;
76
77        self.token_factory.create(TokenData {
78            start,
79            end,
80            data,
81            extra,
82            groups: token_match
83                .names
84                .into_iter()
85                .flat_map(|name| {
86                    token_match
87                        .regex_match
88                        .name(name)
89                        .map(move |m| (name, m.as_str()))
90                })
91                .collect(),
92        })
93    }
94}
95
96pub fn _populate_reporter_extractors() -> Vec<TokenExtractor> {
97    let mut raw_regex_variables = reporters_db::regexes::raw_regexes();
98
99    raw_regex_variables
100        .get_mut("full_cite")
101        .expect("full_cite should already exist")
102        .add("", RegexTemplate::of("$volume $reporter,? $page"));
103
104    raw_regex_variables
105        .get_mut("page")
106        .expect("page should already exist")
107        .add("", RegexTemplate::of(regexes::PAGE_REGEX));
108
109    let regex_vars = process_variables(raw_regex_variables);
110
111    fn _substitute_edition(template: RegexTemplate, edition_name: &[EditionName]) -> RegexTemplate {
112        let mut map: HashMap<String, RegexTemplate> = HashMap::new();
113        let editions: Vec<String> = edition_name
114            .iter()
115            .map(|e| e.value())
116            .map(regex::escape)
117            .collect();
118        map.insert("edition".into(), RegexTemplate::of(editions.join("|")));
119        template.resolve(&map)
120    }
121
122    // # Extractors step one: add an extractor for each reporter string
123    //
124    //     # Build a lookup of regex -> edition.
125    //     # Keys in this dict will be regular expressions to handle a
126    //     # particular reporter string, like (simplified)
127    //     # r"(?P<volume>\d+) (?P<reporter>U\.S\.) (?P<page>\d+)"
128    #[derive(Default, Debug)]
129    struct Lookup {
130        editions: Vec<Edition>,
131        variations: Vec<Edition>,
132        strings: HashSet<String>,
133        short: bool,
134    }
135
136    fn _add_regex(
137        reporters: &[EditionName],
138        edition: &Edition,
139        regex: ResolvedRegex,
140        is_short: bool,
141        result: &mut HashMap<ResolvedRegex, Lookup>,
142        func: fn(&mut Lookup) -> &mut Vec<Edition>,
143    ) {
144        let entry = result.entry(regex.clone()).or_default();
145
146        entry.short = is_short;
147
148        let result = func(entry);
149        result.push(edition.clone());
150
151        let has_strings = regex.value().contains(&regex::escape(reporters[0].value()));
152
153        if has_strings {
154            let cloned = reporters.iter().map(|r| r.value().into());
155
156            for s in cloned {
157                entry.strings.insert(s);
158            }
159        }
160    }
161
162    fn _add_regexes(
163        regex_templates: &[RegexTemplate],
164        edition_name: EditionName,
165        edition: Edition,
166        variations: Vec<EditionName>,
167        variables: &HashMap<String, RegexTemplate>,
168        result: &mut HashMap<ResolvedRegex, Lookup>,
169    ) {
170        for template in regex_templates {
171            let template = reporters_db::utils::recursive_substitute(template.clone(), variables);
172            let arg = vec![edition_name.clone()];
173            let regex = _substitute_edition(template.clone(), arg.as_slice())
174                .resolved()
175                .expect("edition should have been the last thing to resolve");
176
177            let short_regex = regexes::short_cite_re(regex.value());
178            _add_regex(arg.as_slice(), &edition, regex, false, result, |l| {
179                &mut l.editions
180            });
181            _add_regex(arg.as_slice(), &edition, short_regex, true, result, |l| {
182                &mut l.editions
183            });
184
185            if !variations.is_empty() {
186                let variation_regex = _substitute_edition(template, variations.as_slice())
187                    .resolved()
188                    .expect("edition should have been the last thing to resolve");
189
190                let short_variation_regex = regexes::short_cite_re(variation_regex.value());
191
192                _add_regex(
193                    variations.as_slice(),
194                    &edition,
195                    variation_regex,
196                    false,
197                    result,
198                    |l| &mut l.variations,
199                );
200                _add_regex(
201                    variations.as_slice(),
202                    &edition,
203                    short_variation_regex,
204                    false,
205                    result,
206                    |l| &mut l.variations,
207                );
208            }
209        }
210    }
211
212    let mut editions_by_regex: HashMap<ResolvedRegex, Lookup> = HashMap::new();
213
214    // # add reporters.json:
215    let reporters = reporters();
216    for (_key, cluster) in reporters {
217        for source in cluster {
218            let variations = source.variations;
219
220            for (edition_name, edition_data) in source.editions {
221                let regexes = edition_data
222                    .regexes
223                    .clone()
224                    .unwrap_or_else(|| vec![RegexTemplate::of("$full_cite")]);
225
226                let edition_variations: Vec<_> = variations
227                    .iter()
228                    .filter(|(_, v)| edition_name == (*v).clone())
229                    .map(|(k, _)| k.clone())
230                    .collect();
231
232                _add_regexes(
233                    &regexes,
234                    edition_name,
235                    edition_data,
236                    edition_variations,
237                    &regex_vars,
238                    &mut editions_by_regex,
239                )
240            }
241        }
242    }
243
244    // # add laws.json
245
246    // # add journals.json
247
248    let mut extractors = Vec::new();
249
250    // # Add each regex to EXTRACTORS
251    for (regex, lookup) in editions_by_regex {
252        extractors.push(TokenExtractor::new(
253            regexes::nonalphanum_boundaries_re(&regex),
254            TokenFactories::Citation,
255            false,
256            lookup.strings,
257            TokenExtractorExtra {
258                exact_editions: lookup.editions,
259                variation_editions: lookup.variations,
260                short: lookup.short,
261            },
262        ));
263    }
264
265    extractors.push(TokenExtractor::new(
266        ResolvedRegex::of(regexes::ID_REGEX.into()),
267        TokenFactories::Id,
268        true,
269        vec!["id.".into(), "ibid.".into()].into_iter().collect(),
270        Default::default(),
271    ));
272
273    extractors.push(TokenExtractor::new(
274        ResolvedRegex::of(regexes::SUPRA_REGEX.into()),
275        TokenFactories::Supra,
276        true,
277        vec!["supra".into()].into_iter().collect(),
278        Default::default(),
279    ));
280
281    extractors.push(TokenExtractor::new(
282        ResolvedRegex::of(regexes::PARAGRAPH_REGEX.into()),
283        TokenFactories::Paragraph,
284        false,
285        Default::default(),
286        Default::default(),
287    ));
288
289    extractors.push(TokenExtractor::new(
290        ResolvedRegex::of(regexes::STOP_WORD_REGEX.into()),
291        TokenFactories::StopWord,
292        true,
293        regexes::STOP_WORDS.into_iter().map(|s| s.into()).collect(),
294        Default::default(),
295    ));
296
297    extractors.push(TokenExtractor::new(
298        ResolvedRegex::of(regexes::SECTION_REGEX.into()),
299        TokenFactories::Section,
300        false,
301        vec!["ยง"].into_iter().map(|s| s.into()).collect(),
302        Default::default(),
303    ));
304
305    extractors
306}
307
lazy_static! {
    /// All extractors, built once on first access (construction walks the
    /// entire reporters database and compiles every regex, so it is lazy).
    pub static ref EXTRACTORS: Vec<TokenExtractor> = _populate_reporter_extractors();
}
311
#[cfg(test)]
mod tests {
    use super::EXTRACTORS;

    /// Building the extractor set must yield at least one extractor.
    #[test]
    fn build_extractors() {
        // `assert!(!…)` replaces the `assert_eq!(…, false)` anti-pattern.
        assert!(!EXTRACTORS.is_empty());
    }
}