use std::borrow::Cow;
use std::collections::HashMap;
use std::error::Error;
use onig::{Captures, Regex};
use crate::rule::Rule;
use crate::util::{re, re_i};
/// Finds list-item labels (letters, roman numerals, numbers) in text and marks them.
///
/// Every lookup table and pattern is built once in `new` and reused by each call.
pub struct ListItemReplacer {
    // ---- label -> position lookup tables ----
    /// Position of each entry of `ROMAN_NUMERALS` within that list.
    roman_numerals: HashMap<&'static str, isize>,
    /// Position of each entry of `LATIN_NUMERALS` within that list.
    latin_numerals: HashMap<&'static str, isize>,

    // ---- alphabetical labels ----
    /// Locates a single letter that is directly followed by `.` (the period is not matched).
    alphabetical_list_with_periods: Regex,
    /// Locates a run of letters that is directly followed by `)` (parentheses are not matched).
    alphabetical_list_with_parens: Regex,
    /// Matches a single letter together with its trailing `.`; used when rewriting.
    alphabetical_list_letters_and_periods_regex: Regex,
    /// Matches a run of letters before `)`, including a leading `(` if present; used when
    /// rewriting.
    extract_alphabetical_list_letters_regex: Regex,

    // ---- numbered labels ----
    /// Collects the one- or two-digit numbers of `N.` style labels.
    numbered_list_regex_1: Regex,
    /// Matches `N.` style labels, period included; used when rewriting.
    numbered_list_regex_2: Regex,
    /// Matches a one- or two-digit number followed by `)` and whitespace.
    numbered_list_parens_regex: Regex,

    // ---- guards that suppress line-break insertion ----
    /// Matches text in which `♨` occurs on both sides of a `\n` or `\r`.
    find_numbered_list_1: regex::Regex,
    /// Matches `for <number>♨ <lowercase letter>`.
    find_numbered_list_2: regex::Regex,
    /// Matches text in which `☝` occurs on both sides of a `\n` or `\r`.
    find_numbered_list_parens: regex::Regex,

    // ---- whitespace -> `\r` rewrite rules ----
    /// Replaces the whitespace before `<one char><optional whitespace><digits>♨` with `\r`.
    space_between_list_items_first_rule: Rule,
    /// Replaces the whitespace before `<one or two digits>♨` with `\r`.
    space_between_list_items_second_rule: Rule,
    /// Replaces the whitespace before `<one or two digits>☝` with `\r`.
    space_between_list_items_third_rule: Rule,
}
/// Lowercase roman numerals for 1 through 20, in ascending order.
///
/// The position of a numeral in this list is used as its ordinal when checking whether
/// neighbouring labels are consecutive, so every numeral must appear exactly once and
/// in order.
const ROMAN_NUMERALS: &[&str] = &[
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv",
    "xvi", "xvii", "xviii", "xix", "xx",
];
/// Lowercase latin letters `a` through `z`, in alphabetical order.
///
/// The position of a letter in this list serves as its ordinal.
const LATIN_NUMERALS: &[&str] = &[
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", // a..=m
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", // n..=z
];
impl ListItemReplacer {
/// Builds a replacer with every lookup table and pattern prepared up front.
///
/// `re` / `re_i` build the `onig` patterns (presumably case-sensitive and
/// case-insensitive respectively — confirm in `crate::util`).
///
/// # Errors
///
/// Returns an error if any pattern is rejected by `re`, `re_i`, `regex::Regex::new`
/// or `Rule::new`.
pub fn new() -> Result<Self, Box<dyn Error>> {
    /// Maps each entry of `list` to its position in the list.
    ///
    /// If an entry occurs more than once, the position of its last occurrence wins.
    #[must_use]
    fn map_from_list(list: &[&'static str]) -> HashMap<&'static str, isize> {
        list.iter()
            .enumerate()
            // Only short constant tables are passed in, so the cast cannot wrap.
            .map(|(idx, &s)| (s, idx as isize))
            .collect()
    }

    Ok(ListItemReplacer {
        roman_numerals: map_from_list(ROMAN_NUMERALS),
        latin_numerals: map_from_list(LATIN_NUMERALS),
        alphabetical_list_with_periods: re_i(
            r"(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)",
        )?,
        alphabetical_list_with_parens: re_i(
            r"(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))",
        )?,
        alphabetical_list_letters_and_periods_regex: re_i(
            r"(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.",
        )?,
        extract_alphabetical_list_letters_regex: re_i(
            r"\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))",
        )?,
        // Every `-` / `⁃` bullet alternative requires whitespace or a line start in front
        // of the bullet, mirroring the alternatives of `numbered_list_regex_2`.
        numbered_list_regex_1: re(
            r"\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))",
        )?,
        numbered_list_regex_2: re(
            r"(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))",
        )?,
        numbered_list_parens_regex: re(r"\d{1,2}(?=\)\s)")?,
        // `♨` tags numbered labels that were followed by a period.
        find_numbered_list_1: regex::Regex::new(r"♨.+\n.+♨|♨.+\r.+♨")?,
        find_numbered_list_2: regex::Regex::new(r"for\s\d{1,2}♨\s[a-z]")?,
        space_between_list_items_first_rule: Rule::new(r"(?<=\S\S)\s(?=\S\s*\d+♨)", "\r")?,
        space_between_list_items_second_rule: Rule::new(r"(?<=\S\S)\s(?=\d{1,2}♨)", "\r")?,
        // `☝` tags numbered labels that were followed by a closing parenthesis.
        find_numbered_list_parens: regex::Regex::new(r"☝.+\n.+☝|☝.+\r.+☝")?,
        space_between_list_items_third_rule: Rule::new(r"(?<=\S\S)\s(?=\d{1,2}☝)", "\r")?,
    })
}
#[must_use]
pub fn add_line_break<'a>(&self, text: &'a str) -> String {
let text = Cow::Borrowed(text);
let text = self.iterate_alphabet_array(&text, false, false);
let text = self.iterate_alphabet_array(&text, true, false);
let text = self.iterate_alphabet_array(&text, false, true);
let text = self.iterate_alphabet_array(&text, true, true);
let text = self.scan_lists(
&text,
&self.numbered_list_regex_1,
&self.numbered_list_regex_2,
'♨',
true,
);
let text = self.add_line_breaks_for_numbered_list_with_periods(&text);
let text = text.replace("♨", "∯");
let text = self.scan_lists(
&text,
&self.numbered_list_parens_regex,
&self.numbered_list_parens_regex,
'☝',
false,
);
let text = self.add_line_breaks_for_numbered_list_with_parens(&text);
text.replace("☝", "") }
/// Rewrites every `<letter>.` label whose letter equals `what_to_replace`.
///
/// A matching label becomes `\r<letter>∯` (carriage return in front, `∯` in place of
/// the period). Labels with any other letter are left untouched. The comparison with
/// `what_to_replace` is exact, including case.
#[must_use]
fn replace_alphabet_list(&self, text: &str, what_to_replace: &str) -> String {
    self.alphabetical_list_letters_and_periods_regex
        .replace_all(text, |caps: &Captures| {
            let whole = caps.at(0).unwrap();
            // Drop the trailing period, if there is one, to get the bare letter.
            let letter = match whole.strip_suffix('.') {
                Some(stripped) => stripped,
                None => whole,
            };
            if letter != what_to_replace {
                return whole.to_string();
            }
            format!("\r{}∯", letter)
        })
}
/// Rewrites every parenthesised label whose letters equal `what_to_replace`.
///
/// * `(<label>` becomes `\r&✂&<label>` — the opening parenthesis is replaced by the
///   `&✂&` marker.
/// * `<label>` without an opening parenthesis becomes `\r<label>`.
///
/// Labels that differ from `what_to_replace` are left untouched. The comparison is
/// exact, including case.
#[must_use]
fn replace_alphabet_list_parens(&self, text: &str, what_to_replace: &str) -> String {
    self.extract_alphabetical_list_letters_regex
        .replace_all(text, |caps: &Captures| {
            let whole = caps.at(0).unwrap();
            match whole.strip_prefix('(') {
                Some(inner) if inner == what_to_replace => format!("\r&✂&{}", inner),
                None if whole == what_to_replace => format!("\r{}", whole),
                _ => whole.to_string(),
            }
        })
}
/// Marks alphabetical (or roman-numeral) labels that look like part of a sequence.
///
/// * `parens` — `true` selects the `label)` / `(label)` patterns, `false` the `label.`
///   patterns.
/// * `use_roman_numeral` — `true` looks labels up in the roman-numeral table, `false`
///   in the latin-letter table.
///
/// Labels not present in the chosen table are ignored. A label is rewritten only when
/// its ordinal is adjacent to a neighbouring label's ordinal. When fewer than two
/// labels are found, `text` is returned unchanged (borrowed).
#[must_use]
fn iterate_alphabet_array<'a>(
    &self,
    text: &'a str,
    parens: bool,
    use_roman_numeral: bool,
) -> Cow<'a, str> {
    let pattern = if parens {
        &self.alphabetical_list_with_parens
    } else {
        &self.alphabetical_list_with_periods
    };
    let ordinals = if use_roman_numeral {
        &self.roman_numerals
    } else {
        &self.latin_numerals
    };

    // (label text, ordinal) for every match that is a known label.
    let mut labels: Vec<(&str, isize)> = Vec::new();
    for (start, end) in pattern.find_iter(text) {
        let label = &text[start..end];
        if let Some(&ordinal) = ordinals.get(label) {
            labels.push((label, ordinal));
        }
    }

    let count = labels.len();
    if count <= 1 {
        // A lone label is never treated as a list.
        return Cow::Borrowed(text);
    }
    let last = count - 1;

    // `b` is exactly one step after `a`.
    let steps_up = |a: usize, b: usize| labels[b].1 - labels[a].1 == 1;
    // `a` and `b` are one step apart in either direction.
    let one_apart = |a: usize, b: usize| (labels[a].1 - labels[b].1).abs() == 1;

    let mut output = Cow::Borrowed(text);
    for (pos, &(label, _)) in labels.iter().enumerate() {
        let in_sequence = if pos == last {
            one_apart(last - 1, last)
        } else {
            // For the first label the "previous" neighbour wraps around to the last one.
            let previous = if pos == 0 { last } else { pos - 1 };
            steps_up(pos, pos + 1) || one_apart(previous, pos)
        };
        if !in_sequence {
            continue;
        }
        let rewritten = if parens {
            self.replace_alphabet_list_parens(&output, label)
        } else {
            self.replace_alphabet_list(&output, label)
        };
        output = Cow::Owned(rewritten);
    }
    output
}
/// Tags numbered list labels that are part of a consecutive run.
///
/// * `regex1` collects the candidate numbers from `text`.
/// * `regex2` is the pattern used to rewrite a label once its number qualifies.
/// * `replacement` is the tag character written after the number, in place of the
///   matched label's trailing punctuation (if the match included any).
/// * `strip` trims surrounding whitespace from each `regex2` match before comparing.
///
/// A number qualifies when the next collected number is one greater, the previous one
/// is one smaller, or the pair wraps `9 -> 0`. Returns `text` borrowed when nothing
/// qualifies.
///
/// Matches of `regex1` that do not parse as `i32` are skipped rather than causing a
/// panic (this could happen, for example, if the regex engine's `\d` accepts
/// non-ASCII digits).
#[must_use]
fn scan_lists<'a>(
    &self,
    text: &'a str,
    regex1: &Regex,
    regex2: &Regex,
    replacement: char,
    strip: bool,
) -> Cow<'a, str> {
    let list_array: Vec<i32> = regex1
        .find_iter(text)
        .filter_map(|(start, end)| text[start..end].trim_start().parse::<i32>().ok())
        .collect();

    let mut result = Cow::Borrowed(text);
    for (i, &each) in list_array.iter().enumerate() {
        let prev = i.checked_sub(1).and_then(|p| list_array.get(p)).copied();
        let next = list_array.get(i + 1).copied();
        let in_sequence = next == Some(each + 1)
            || prev == Some(each - 1)
            || (each == 0 && prev == Some(9))
            || (each == 9 && next == Some(0));
        if !in_sequence {
            continue;
        }

        // Rendered once per number instead of once per regex match.
        let label = each.to_string();
        result = Cow::Owned(regex2.replace_all(&result, |m: &Captures| {
            // Group 0 is the whole match and is always present.
            let mut mat = m.at(0).unwrap();
            if strip {
                mat = mat.trim();
            }
            // A single-character match is compared as is; longer matches lose any
            // surrounding `.`, `]` or `)` first.
            let chomped = if mat.len() == 1 {
                mat
            } else {
                mat.trim_matches(&['.', ']', ')'][..])
            };
            if label == chomped {
                format!("{}{}", each, replacement)
            } else {
                mat.to_string()
            }
        }));
    }
    result
}
/// Puts `\r` between `♨`-tagged list items that share a single line.
///
/// `text` is returned unchanged (borrowed) when it contains no `♨`, when
/// `find_numbered_list_1` matches (tagged items already separated by `\n` or `\r`), or
/// when `find_numbered_list_2` matches (`for <number>♨ <lowercase letter>`). Otherwise
/// the first and then the second spacing rule are applied.
#[must_use]
fn add_line_breaks_for_numbered_list_with_periods<'a>(&self, text: &'a str) -> Cow<'a, str> {
    if !text.contains('♨') {
        return Cow::Borrowed(text);
    }
    if self.find_numbered_list_1.is_match(text) || self.find_numbered_list_2.is_match(text) {
        return Cow::Borrowed(text);
    }
    let first_pass = self.space_between_list_items_first_rule.replace_all(text);
    let second_pass = self
        .space_between_list_items_second_rule
        .replace_all(&first_pass);
    Cow::Owned(second_pass)
}
/// Puts `\r` between `☝`-tagged list items that share a single line.
///
/// `text` is returned unchanged (borrowed) when it contains no `☝` or when
/// `find_numbered_list_parens` matches (tagged items already separated by `\n` or
/// `\r`). Otherwise the third spacing rule is applied.
#[must_use]
fn add_line_breaks_for_numbered_list_with_parens<'a>(&self, text: &'a str) -> Cow<'a, str> {
    if !text.contains('☝') {
        return Cow::Borrowed(text);
    }
    if self.find_numbered_list_parens.is_match(text) {
        return Cow::Borrowed(text);
    }
    let separated = self.space_between_list_items_third_rule.replace_all(text);
    Cow::Owned(separated)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::error::Error;
type TestResult = Result<(), Box<dyn Error>>;
/// Checks which byte ranges `alphabetical_list_with_periods` matches in a sample sentence.
#[test]
fn test_alphabetical_list_with_periods() -> TestResult {
let list = ListItemReplacer::new()?;
// The sample includes upper-case labels, a label glued to the previous word ("nextF.")
// and a letter without a trailing period ("G").
let text =
"a. The first item b. The second item c. The third list item D. case insesitive \
E. Don't select the nextF.dont't select this G should be followed by dot";
// Expected values are (start, end) byte offsets into `text`.
assert_eq!(
list.alphabetical_list_with_periods
.find_iter(text)
.collect::<Vec<_>>(),
vec![
(0, 1), (18, 19), (37, 38), (60, 61), (79, 80), ]
);
Ok(())
}
/// Checks which byte ranges `alphabetical_list_with_parens` matches for `x)` and `(x)` labels.
#[test]
fn test_alphabetical_list_with_parens() -> TestResult {
let list = ListItemReplacer::new()?;
// NOTE(review): the expected offsets depend on the exact whitespace inside this literal.
let text = "\
a) Hello world.
b) Hello world.
c) Hello world.
d) Hello world.
e) Hello world.
f) Hello world.
(i) Hello world.
(ii) Hello world.
(iii) Hello world.
(iv) Hello world.
(v) Hello world.
(vi) Hello world.
";
// Expected values are (start, end) byte offsets of the label letters only.
assert_eq!(
list.alphabetical_list_with_parens
.find_iter(text)
.collect::<Vec<_>>(),
vec![
(0, 1,),
(16, 17,),
(32, 33,),
(48, 49,),
(64, 65,),
(80, 81,),
(98, 99,),
(115, 117,),
(133, 136,),
(152, 154,),
(170, 171,),
(187, 189,),
]
);
Ok(())
}
/// Checks which byte ranges `alphabetical_list_letters_and_periods_regex` matches.
#[test]
fn test_alphabetical_list_letters_and_periods_regex() -> TestResult {
let list = ListItemReplacer::new()?;
// The sample mixes list labels ("a.", "b.", "c.") with name initials ("E.", "A.", "B.").
let text = "His name is Mark E. Smith. a. here it is b. another c. one more
They went to the store. It was John A. Smith. She was Jane B. Smith.";
// Expected values are (start, end) byte offsets; each range covers the letter and its period.
assert_eq!(
list.alphabetical_list_letters_and_periods_regex
.find_iter(text)
.collect::<Vec<_>>(),
vec![
(17, 19), (27, 29), (41, 43), (52, 54), (101, 103), (124, 126), ]
);
Ok(())
}
/// Checks which byte ranges `extract_alphabetical_list_letters_regex` matches for `x)` labels.
#[test]
fn test_extract_alphabetical_list_letters_regex() -> TestResult {
let list = ListItemReplacer::new()?;
// Lower-case and upper-case labels, separated by an explicit `\n`.
let text =
"a) here it is b) another c) one more \nThey went to the store. W) hello X) hello Y) hello";
// Expected values are (start, end) byte offsets into `text`.
assert_eq!(
list.extract_alphabetical_list_letters_regex
.find_iter(text)
.collect::<Vec<_>>(),
vec![
(0, 1), (14, 15), (25, 26), (62, 63), (71, 72), (80, 81), ]
);
Ok(())
}
/// Checks which byte ranges `numbered_list_regex_1` matches in a sample with `N.` labels.
#[test]
fn test_numbered_list_regex_1() -> TestResult {
let list = ListItemReplacer::new()?;
// NOTE(review): the expected offsets depend on the exact whitespace inside this literal.
let text = "\
Match below
1. abcd
2. xyz
1. as
2. yo
3. asdf
4. asdf
Dont match below
1.abc
2) asdf
333. asdf
";
// Expected values are (start, end) byte offsets into `text`.
assert_eq!(
list.numbered_list_regex_1
.find_iter(text)
.collect::<Vec<_>>(),
vec![(12, 14), (21, 23), (33, 35), (43, 45), (49, 51), (58, 60),]
);
Ok(())
}
/// Checks which byte ranges `numbered_list_regex_2` matches in a sample with `N.` labels.
#[test]
fn test_numbered_list_regex_2() -> TestResult {
let list = ListItemReplacer::new()?;
// NOTE(review): the expected offsets depend on the exact whitespace inside this literal.
let text = "\
Match below
1. abcd
2. xyz
1. as
2. yo
3. asdf
4. asdf
Dont match below
1.abc
2) asdf
333. asdf
";
// Expected values are (start, end) byte offsets into `text`.
assert_eq!(
list.numbered_list_regex_2
.find_iter(text)
.collect::<Vec<_>>(),
vec![(13, 15), (22, 24), (34, 36), (44, 46), (50, 52), (59, 61),]
);
Ok(())
}
/// Checks which byte ranges `numbered_list_parens_regex` matches in a sample with `N)` labels.
#[test]
fn test_numbered_list_parens_regex() -> TestResult {
let list = ListItemReplacer::new()?;
// NOTE(review): the expected offsets depend on the exact whitespace inside this literal.
let text = "\
1) a
2) b
1) b1
2) b2
3) c
4) 5)
55) d
666) e
f77) f
8888) f
10)nomatch
-10) ignore sign
";
// Expected values are (start, end) byte offsets; the pattern matches at most two digits,
// so longer numbers contribute only their last two digits.
assert_eq!(
list.numbered_list_parens_regex
.find_iter(text)
.collect::<Vec<_>>(),
vec![
(0, 1),
(5, 6),
(14, 15),
(24, 25),
(30, 31),
(35, 36),
(38, 39),
(41, 43),
(48, 50),
(55, 57),
(63, 65),
(81, 83),
]
);
Ok(())
}
/// Checks that the first spacing rule turns the space before each `⁃N♨` item into `\r`.
#[test]
fn test_space_between_list_items_first_rule() -> TestResult {
let list = ListItemReplacer::new()?;
let input = "abcd ⁃9♨ The first item ⁃10♨ The second item ⁃9♨ The first item ⁃10♨ The second item ⁃9♨ The first item ⁃10♨ The second item ⁃9♨ The first item ⁃10♨ The second item ⁃9♨ The first item ⁃10♨ The second item ⁃9♨ The first item ⁃10♨ The second item";
// Per the expected output, the space between "abcd" and the first item is left as is.
let output = "abcd ⁃9♨ The first item\r⁃10♨ The second item\r⁃9♨ The first item\r⁃10♨ The second item\r⁃9♨ The first item\r⁃10♨ The second item\r⁃9♨ The first item\r⁃10♨ The second item\r⁃9♨ The first item\r⁃10♨ The second item\r⁃9♨ The first item\r⁃10♨ The second item";
assert_eq!(
list.space_between_list_items_first_rule.replace_all(input),
output
);
Ok(())
}
/// Checks that the second spacing rule turns the space before an `N♨` item into `\r`.
#[test]
fn test_space_between_list_items_second_rule() -> TestResult {
let list = ListItemReplacer::new()?;
let input = "1♨ The first item 2♨ The second item";
let output = "1♨ The first item\r2♨ The second item";
assert_eq!(
list.space_between_list_items_second_rule.replace_all(input),
output
);
Ok(())
}
/// Checks that the third spacing rule turns the space before an `N☝` item into `\r`.
#[test]
fn test_space_between_list_items_third_rule() -> TestResult {
let list = ListItemReplacer::new()?;
let input = "1☝) The first item 2☝) The second item";
let output = "1☝) The first item\r2☝) The second item";
assert_eq!(
list.space_between_list_items_third_rule.replace_all(input),
output
);
Ok(())
}
/// Checks that `replace_alphabet_list` rewrites only the requested letter's label.
#[test]
fn test_replace_alphabet_list() -> TestResult {
let list = ListItemReplacer::new()?;
// Only "b." is rewritten (to "\rb∯"); "a." and "c." stay as they are.
assert_eq!(
list.replace_alphabet_list("a. ffegnog b. fgegkl c.", "b"),
"a. ffegnog \rb∯ fgegkl c."
);
Ok(())
}
/// Checks both rewrite forms of `replace_alphabet_list_parens`.
#[test]
fn test_replace_alphabet_list_parens() -> TestResult {
let list = ListItemReplacer::new()?;
// A label without an opening parenthesis only gets a leading `\r`.
assert_eq!(
list.replace_alphabet_list_parens("a) ffegnog (b) fgegkl c)", "a"),
"\ra) ffegnog (b) fgegkl c)"
);
// A label with an opening parenthesis has the `(` replaced by `\r&✂&`.
assert_eq!(
list.replace_alphabet_list_parens("a) ffegnog (b) fgegkl c)", "b"),
"a) ffegnog \r&✂&b) fgegkl c)"
);
Ok(())
}
/// Exercises `iterate_alphabet_array` for each combination of `parens` and
/// `use_roman_numeral`.
#[test]
fn test_iterate_alphabet_array() -> TestResult {
let list = ListItemReplacer::new()?;
// A single label is never rewritten.
assert_eq!(list.iterate_alphabet_array("i. Hi", false, true), "i. Hi");
// Latin letters with periods: lower-case labels are rewritten, upper-case ones are not.
let input = "\
Replace
a. Lorem
b. Donec
c. Aenean
Don't
A. Vestibulum
B. Proin
C. Maecenas
";
let output = "\
Replace
\ra∯ Lorem
\rb∯ Donec
\rc∯ Aenean
Don't
A. Vestibulum
B. Proin
C. Maecenas
";
assert_eq!(list.iterate_alphabet_array(input, false, false), output,);
// Latin letters with parentheses, in both `x)` and `(x)` form.
let input = "\
Do
a) Lorem
b) Donec
c) Aenean
(a) Lorem
(b) Donec
(c) Aenean
Don't
A) Vestibulum
B) Proin
C) Maecenas
(A) Vestibulum
(B) Proin
(C) Maecenas
";
// Each letter occurs twice in the input, so the `x)` form receives `\r` twice.
let output = "\
Do
\r\ra) Lorem
\r\rb) Donec
\r\rc) Aenean
\r&✂&a) Lorem
\r&✂&b) Donec
\r&✂&c) Aenean
Don't
A) Vestibulum
B) Proin
C) Maecenas
(A) Vestibulum
(B) Proin
(C) Maecenas
";
assert_eq!(list.iterate_alphabet_array(input, true, false), output,);
// Roman numerals with periods: the input is expected to come back unchanged.
let input = "\
NOP
i. Ut eu volutpat felis.
ii. Mauris
iii. Proin
I. Suspendisse
II. Maecenas
III. Nam
";
assert_eq!(list.iterate_alphabet_array(input, false, true), input,);
// Roman numerals with parentheses, in both `x)` and `(x)` form.
let input = "\
Do
i) Ut eu volutpat felis.
ii) Mauris
iii) Proin
(i) Ut eu volutpat felis.
(ii) Mauris
(iii) Proin
Don't
I) Suspendisse
II) Maecenas
III) Nam
(I) Suspendisse
(II) Maecenas
(III) Nam
";
let output = "\
Do
\r\ri) Ut eu volutpat felis.
\r\rii) Mauris
\r\riii) Proin
\r&✂&i) Ut eu volutpat felis.
\r&✂&ii) Mauris
\r&✂&iii) Proin
Don't
I) Suspendisse
II) Maecenas
III) Nam
(I) Suspendisse
(II) Maecenas
(III) Nam
";
assert_eq!(list.iterate_alphabet_array(input, true, true), output,);
Ok(())
}
/// Exercises `scan_lists` with the period patterns (`♨` tag) and the parenthesis
/// pattern (`☝` tag).
#[test]
fn test_scan_lists() -> TestResult {
let list = ListItemReplacer::new()?;
// `N.` labels: the period of each label in a consecutive run is replaced by `♨`.
let input = "\
Match below
1. abcd
2. xyz
1. as
2. yo
3. asdf
4. asdf
Dont match below
1.abc
2) asdf
333. asdf
";
let output = "\
Match below
1♨ abcd
2♨ xyz
1♨ as
2♨ yo
3♨ asdf
4♨ asdf
Dont match below
1.abc
2) asdf
333. asdf
";
assert_eq!(
list.scan_lists(
input,
&list.numbered_list_regex_1,
&list.numbered_list_regex_2,
'♨',
true
),
Cow::<str>::Borrowed(output)
);
// `N)` labels: `☝` is inserted between the number and the closing parenthesis.
let input = "\
1) a
2) b
1) b1
2) b2
3) c
4) 5)
55) d
666) e
f77) f
8888) f
10)nomatch
-10) ignore sign
";
let output = "\
1☝) a
2☝) b
1☝) b1
2☝) b2
3☝) c
4☝) 5☝)
55) d
666) e
f77) f
8888) f
10)nomatch
-10) ignore sign
";
// The same pattern is passed for both the collecting and the rewriting role.
assert_eq!(
list.scan_lists(
input,
&list.numbered_list_parens_regex,
&list.numbered_list_parens_regex,
'☝',
false
),
Cow::<str>::Borrowed(output)
);
Ok(())
}
/// Checks that `♨`-tagged items on a single line are separated with `\r`.
#[test]
fn test_add_line_breaks_for_numbered_list_with_periods() -> TestResult {
let list = ListItemReplacer::new()?;
let input = "1♨ abcd 2♨ xyz 3♨ asdf 4♨ asdf";
let output = "1♨ abcd\r2♨ xyz\r3♨ asdf\r4♨ asdf";
assert_eq!(
list.add_line_breaks_for_numbered_list_with_periods(input),
output
);
Ok(())
}
/// Checks that `☝`-tagged items on a single line are separated with `\r`.
#[test]
fn test_add_line_breaks_for_numbered_list_with_parens() -> TestResult {
let list = ListItemReplacer::new()?;
let input = "1☝) The first item 2☝) The second item";
let output = "1☝) The first item\r2☝) The second item";
assert_eq!(
list.add_line_breaks_for_numbered_list_with_parens(input),
output
);
Ok(())
}
}