alpino_tokenizer/
preproc.rs

1use std::borrow::Cow;
2
3use lazy_static::lazy_static;
4use regex::Regex;
5
6// This function rewrites enumerations of the form
7//
8// 1. foo, 2. bar en 3. baz
9//
10// to
11//
12// 1# foo, 2# bar en 3# baz
13fn add_enumeration_markers(text: &str) -> Cow<str> {
14    lazy_static! {
15        static ref RE: Regex = Regex::new("(\\s?1)[.](\\s.*?\\W2[.])").unwrap();
16    }
17
18    let mut text = RE.replace_all(text, "$1#$2");
19
20    if let text @ Cow::Borrowed(_) = text {
21        return text;
22    }
23
24    let mut prev = 1;
25    let mut next = 2;
26
27    loop {
28        let next_expr = Regex::new(&format!("({}#\\s.*?\\W{})[.](\\s)", prev, next))
29            .expect("Invalid enumeration expression.");
30        let text_after = next_expr.replace_all(&text, "$1#$2");
31
32        if let Cow::Borrowed(_) = text_after {
33            break;
34        }
35
36        text = Cow::Owned(text_after.into_owned());
37        prev += 1;
38        next += 1;
39    }
40
41    text
42}
43
44pub fn preprocess(text: &str) -> Cow<str> {
45    add_enumeration_markers(text)
46}
47
#[cfg(test)]
mod tests {
    use super::preprocess;

    // Enumeration counters must get `#` markers, both for a single
    // enumeration and for two enumerations in the same text.
    #[test]
    fn add_enumeration_markers() {
        // (input, expected output) pairs.
        let cases = [
            (
                "1. boter, 2. kaas en 3. eieren",
                "1# boter, 2# kaas en 3# eieren",
            ),
            (
                "1. boter, 2. kaas en 3. eieren, 1. foo en 2. bar",
                "1# boter, 2# kaas en 3# eieren, 1# foo en 2# bar",
            ),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(preprocess(input), expected);
        }
    }
}