alpino_tokenizer/
preproc.rs1use std::borrow::Cow;
2
3use lazy_static::lazy_static;
4use regex::Regex;
5
6fn add_enumeration_markers(text: &str) -> Cow<str> {
14 lazy_static! {
15 static ref RE: Regex = Regex::new("(\\s?1)[.](\\s.*?\\W2[.])").unwrap();
16 }
17
18 let mut text = RE.replace_all(text, "$1#$2");
19
20 if let text @ Cow::Borrowed(_) = text {
21 return text;
22 }
23
24 let mut prev = 1;
25 let mut next = 2;
26
27 loop {
28 let next_expr = Regex::new(&format!("({}#\\s.*?\\W{})[.](\\s)", prev, next))
29 .expect("Invalid enumeration expression.");
30 let text_after = next_expr.replace_all(&text, "$1#$2");
31
32 if let Cow::Borrowed(_) = text_after {
33 break;
34 }
35
36 text = Cow::Owned(text_after.into_owned());
37 prev += 1;
38 next += 1;
39 }
40
41 text
42}
43
44pub fn preprocess(text: &str) -> Cow<str> {
45 add_enumeration_markers(text)
46}
47
#[cfg(test)]
mod tests {
    use super::preprocess;

    /// Enumeration numbers get `#` in place of their period, including
    /// when a second enumeration starts again at `1` in the same text.
    #[test]
    fn add_enumeration_markers() {
        // (input, expected output) pairs.
        let cases = [
            (
                "1. boter, 2. kaas en 3. eieren",
                "1# boter, 2# kaas en 3# eieren",
            ),
            (
                "1. boter, 2. kaas en 3. eieren, 1. foo en 2. bar",
                "1# boter, 2# kaas en 3# eieren, 1# foo en 2# bar",
            ),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(preprocess(input), expected);
        }
    }
}