1use crate::regexes;
2use crate::tokenizers::models::{Token, TokenData, TokenFactories, TokenFactory};
3use lazy_static::lazy_static;
4use reporters_db::regexes::{RegexTemplate, ResolvedRegex};
5use reporters_db::reporters::{reporters, Edition, EditionName};
6use reporters_db::utils::process_variables;
7use std::collections::{HashMap, HashSet};
8
/// Edition metadata attached to every token an extractor produces:
/// which reporter editions the regex was built from (exactly or via a
/// name variation), and whether the regex is the short-cite form.
#[derive(Default, Debug, Clone, Eq, PartialEq)]
pub struct TokenExtractorExtra {
    // Editions whose canonical name was substituted into the regex.
    pub exact_editions: Vec<Edition>,
    // Editions matched through a name variation rather than the canonical name.
    pub variation_editions: Vec<Edition>,
    // True when the regex is the short-cite variant (built via `short_cite_re`).
    pub short: bool,
}
15
/// One regex match in the scanned text, bundled with the capture-group
/// names of the regex that produced it so named groups can be looked up
/// later when the match is turned into a token.
pub struct TokenMatch<'a> {
    pub(crate) regex_match: regex::Captures<'a>,
    pub(crate) names: Vec<&'a str>,
}
20
/// Scans text with one pre-compiled regex and converts its matches into
/// `Token`s via `token_factory`.
#[derive(Debug)]
pub struct TokenExtractor {
    // The fully-resolved regex template this extractor was built from.
    pub regex: ResolvedRegex,
    // Factory deciding which `Token` variant each match becomes.
    pub token_factory: TokenFactories,
    // Edition metadata forwarded into every produced token.
    pub extra: TokenExtractorExtra,
    // Literal strings associated with this extractor (reporter names,
    // "id.", "supra", etc. — see `_populate_reporter_extractors`).
    pub strings: HashSet<String>,
    // Whether `built_regex` was compiled case-insensitively.
    pub ignore_case: bool,
    // `regex` compiled once at construction time.
    built_regex: regex::Regex,
}
30
31impl TokenExtractor {
32 pub fn new(
33 regex: ResolvedRegex,
34 token_factory: TokenFactories,
35 ignore_case: bool,
36 strings: HashSet<String>,
37 extra: TokenExtractorExtra,
38 ) -> Self {
39 let built_regex = regex::RegexBuilder::new(regex.value())
40 .case_insensitive(ignore_case)
41 .build()
42 .expect("unable to build regex");
43
44 Self {
45 regex,
46 token_factory,
47 built_regex,
48 ignore_case,
49 strings,
50 extra,
51 }
52 }
53
54 pub fn get_matches<'a>(&'a self, text: &'a str) -> Vec<TokenMatch<'a>> {
56 let matches = self.built_regex.captures_iter(text);
57 let names: Vec<_> = self.built_regex.capture_names().flatten().collect();
58
59 matches
60 .into_iter()
61 .map(|regex_match| TokenMatch {
62 regex_match,
63 names: names.clone(),
64 })
65 .collect()
66 }
67
68 pub fn get_token<'a>(&'a self, token_match: TokenMatch<'a>) -> Token<'a> {
70 let m = token_match.regex_match.get(1).unwrap();
71 let start = m.start();
72 let end = m.end();
73 let data: &'a str = m.as_str();
74
75 let extra: &'a TokenExtractorExtra = &self.extra;
76
77 self.token_factory.create(TokenData {
78 start,
79 end,
80 data,
81 extra,
82 groups: token_match
83 .names
84 .into_iter()
85 .flat_map(|name| {
86 token_match
87 .regex_match
88 .name(name)
89 .map(move |m| (name, m.as_str()))
90 })
91 .collect(),
92 })
93 }
94}
95
96pub fn _populate_reporter_extractors() -> Vec<TokenExtractor> {
97 let mut raw_regex_variables = reporters_db::regexes::raw_regexes();
98
99 raw_regex_variables
100 .get_mut("full_cite")
101 .expect("full_cite should already exist")
102 .add("", RegexTemplate::of("$volume $reporter,? $page"));
103
104 raw_regex_variables
105 .get_mut("page")
106 .expect("page should already exist")
107 .add("", RegexTemplate::of(regexes::PAGE_REGEX));
108
109 let regex_vars = process_variables(raw_regex_variables);
110
111 fn _substitute_edition(template: RegexTemplate, edition_name: &[EditionName]) -> RegexTemplate {
112 let mut map: HashMap<String, RegexTemplate> = HashMap::new();
113 let editions: Vec<String> = edition_name
114 .iter()
115 .map(|e| e.value())
116 .map(regex::escape)
117 .collect();
118 map.insert("edition".into(), RegexTemplate::of(editions.join("|")));
119 template.resolve(&map)
120 }
121
122 #[derive(Default, Debug)]
129 struct Lookup {
130 editions: Vec<Edition>,
131 variations: Vec<Edition>,
132 strings: HashSet<String>,
133 short: bool,
134 }
135
136 fn _add_regex(
137 reporters: &[EditionName],
138 edition: &Edition,
139 regex: ResolvedRegex,
140 is_short: bool,
141 result: &mut HashMap<ResolvedRegex, Lookup>,
142 func: fn(&mut Lookup) -> &mut Vec<Edition>,
143 ) {
144 let entry = result.entry(regex.clone()).or_default();
145
146 entry.short = is_short;
147
148 let result = func(entry);
149 result.push(edition.clone());
150
151 let has_strings = regex.value().contains(®ex::escape(reporters[0].value()));
152
153 if has_strings {
154 let cloned = reporters.iter().map(|r| r.value().into());
155
156 for s in cloned {
157 entry.strings.insert(s);
158 }
159 }
160 }
161
162 fn _add_regexes(
163 regex_templates: &[RegexTemplate],
164 edition_name: EditionName,
165 edition: Edition,
166 variations: Vec<EditionName>,
167 variables: &HashMap<String, RegexTemplate>,
168 result: &mut HashMap<ResolvedRegex, Lookup>,
169 ) {
170 for template in regex_templates {
171 let template = reporters_db::utils::recursive_substitute(template.clone(), variables);
172 let arg = vec![edition_name.clone()];
173 let regex = _substitute_edition(template.clone(), arg.as_slice())
174 .resolved()
175 .expect("edition should have been the last thing to resolve");
176
177 let short_regex = regexes::short_cite_re(regex.value());
178 _add_regex(arg.as_slice(), &edition, regex, false, result, |l| {
179 &mut l.editions
180 });
181 _add_regex(arg.as_slice(), &edition, short_regex, true, result, |l| {
182 &mut l.editions
183 });
184
185 if !variations.is_empty() {
186 let variation_regex = _substitute_edition(template, variations.as_slice())
187 .resolved()
188 .expect("edition should have been the last thing to resolve");
189
190 let short_variation_regex = regexes::short_cite_re(variation_regex.value());
191
192 _add_regex(
193 variations.as_slice(),
194 &edition,
195 variation_regex,
196 false,
197 result,
198 |l| &mut l.variations,
199 );
200 _add_regex(
201 variations.as_slice(),
202 &edition,
203 short_variation_regex,
204 false,
205 result,
206 |l| &mut l.variations,
207 );
208 }
209 }
210 }
211
212 let mut editions_by_regex: HashMap<ResolvedRegex, Lookup> = HashMap::new();
213
214 let reporters = reporters();
216 for (_key, cluster) in reporters {
217 for source in cluster {
218 let variations = source.variations;
219
220 for (edition_name, edition_data) in source.editions {
221 let regexes = edition_data
222 .regexes
223 .clone()
224 .unwrap_or_else(|| vec![RegexTemplate::of("$full_cite")]);
225
226 let edition_variations: Vec<_> = variations
227 .iter()
228 .filter(|(_, v)| edition_name == (*v).clone())
229 .map(|(k, _)| k.clone())
230 .collect();
231
232 _add_regexes(
233 ®exes,
234 edition_name,
235 edition_data,
236 edition_variations,
237 ®ex_vars,
238 &mut editions_by_regex,
239 )
240 }
241 }
242 }
243
244 let mut extractors = Vec::new();
249
250 for (regex, lookup) in editions_by_regex {
252 extractors.push(TokenExtractor::new(
253 regexes::nonalphanum_boundaries_re(®ex),
254 TokenFactories::Citation,
255 false,
256 lookup.strings,
257 TokenExtractorExtra {
258 exact_editions: lookup.editions,
259 variation_editions: lookup.variations,
260 short: lookup.short,
261 },
262 ));
263 }
264
265 extractors.push(TokenExtractor::new(
266 ResolvedRegex::of(regexes::ID_REGEX.into()),
267 TokenFactories::Id,
268 true,
269 vec!["id.".into(), "ibid.".into()].into_iter().collect(),
270 Default::default(),
271 ));
272
273 extractors.push(TokenExtractor::new(
274 ResolvedRegex::of(regexes::SUPRA_REGEX.into()),
275 TokenFactories::Supra,
276 true,
277 vec!["supra".into()].into_iter().collect(),
278 Default::default(),
279 ));
280
281 extractors.push(TokenExtractor::new(
282 ResolvedRegex::of(regexes::PARAGRAPH_REGEX.into()),
283 TokenFactories::Paragraph,
284 false,
285 Default::default(),
286 Default::default(),
287 ));
288
289 extractors.push(TokenExtractor::new(
290 ResolvedRegex::of(regexes::STOP_WORD_REGEX.into()),
291 TokenFactories::StopWord,
292 true,
293 regexes::STOP_WORDS.into_iter().map(|s| s.into()).collect(),
294 Default::default(),
295 ));
296
297 extractors.push(TokenExtractor::new(
298 ResolvedRegex::of(regexes::SECTION_REGEX.into()),
299 TokenFactories::Section,
300 false,
301 vec!["ยง"].into_iter().map(|s| s.into()).collect(),
302 Default::default(),
303 ));
304
305 extractors
306}
307
lazy_static! {
    /// All token extractors, built once on first access. Construction
    /// walks the entire reporters database, so the first deref is
    /// relatively costly; subsequent accesses are free.
    pub static ref EXTRACTORS: Vec<TokenExtractor> = _populate_reporter_extractors();
}
311
#[cfg(test)]
mod tests {
    use super::EXTRACTORS;

    /// Forcing the lazy static should produce at least one extractor
    /// (and not panic while compiling the regexes).
    #[test]
    fn build_extractors() {
        // Idiom fix: assert the predicate directly instead of comparing
        // a bool against `false`.
        assert!(!EXTRACTORS.is_empty());
    }
}