use super::Replacer;
use regex::{Regex, RegexBuilder};
use std::sync::LazyLock;
/// Matches a run of non-standard space characters (U+00A0 no-break space or
/// U+2007 figure space) located at the beginning of a line.
///
/// Multi-line mode is enabled so that `^` anchors at every line start rather
/// than only at the start of the whole text.
static LEADING_NONSTANDARD_WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
    let mut builder = RegexBuilder::new("^[\u{00a0}\u{2007}]+");
    builder.multi_line(true);

    // The pattern is a compile-time constant; failing to build it is a programming error.
    builder.build().unwrap()
});
/// Pairs a pattern for whitespace-only lines with an empty replacement.
///
/// Multi-line mode makes `^` and `$` anchor at line boundaries. Because `\s`
/// also matches newline characters, a single match may span several
/// consecutive whitespace-only lines.
static WHITESPACE_ONLY_LINE: LazyLock<Replacer> = LazyLock::new(|| {
    let mut builder = RegexBuilder::new(r"^\s+$");
    builder.multi_line(true);

    Replacer::RegexReplace {
        regex: builder.build().unwrap(),
        replacement: "",
    }
});
/// Pairs a pattern for newlines at the very start of the text with an empty
/// replacement.
///
/// Multi-line mode is NOT enabled here, so `^` only anchors at the beginning
/// of the whole input.
static LEADING_NEWLINES: LazyLock<Replacer> = LazyLock::new(|| {
    let regex = Regex::new(r"^\n+").unwrap();

    Replacer::RegexReplace {
        regex,
        replacement: "",
    }
});
/// Pairs a pattern for newlines at the very end of the text with an empty
/// replacement.
///
/// Multi-line mode is NOT enabled here, so `$` only anchors at the end of the
/// whole input.
static TRAILING_NEWLINES: LazyLock<Replacer> = LazyLock::new(|| {
    let regex = Regex::new(r"\n+$").unwrap();

    Replacer::RegexReplace {
        regex,
        replacement: "",
    }
});
/// Pairs a pattern for DOS (`\r\n`) and legacy Mac (`\r`) line endings with a
/// Unix newline (`\n`) replacement.
///
/// The optional `\n` means a `\r\n` pair is consumed as a single match rather
/// than producing two newlines.
static DOS_MAC_NEWLINES: LazyLock<Replacer> = LazyLock::new(|| {
    let regex = Regex::new(r"\r\n?").unwrap();

    Replacer::RegexReplace {
        regex,
        replacement: "\n",
    }
});
/// Pairs a pattern for a backslash immediately followed by a newline with an
/// empty replacement, which joins the line with the one after it.
static CONCAT_LINES: LazyLock<Replacer> = LazyLock::new(|| {
    let regex = Regex::new(r"\\\n").unwrap();

    Replacer::RegexReplace {
        regex,
        replacement: "",
    }
});
/// Pairs a pattern for a tab character with a space replacement.
static TABS: LazyLock<Replacer> = LazyLock::new(|| {
    let regex = Regex::new("\t").unwrap();

    Replacer::RegexReplace {
        regex,
        replacement: " ",
    }
});
/// Pairs a pattern for a NUL character (U+0000) with a space replacement.
static NULL_SPACE: LazyLock<Replacer> = LazyLock::new(|| {
    let regex = Regex::new("\0").unwrap();

    Replacer::RegexReplace {
        regex,
        replacement: " ",
    }
});
pub fn substitute(text: &mut String) {
let mut buffer = String::new();
macro_rules! replace {
($replacer:expr) => {
$replacer.replace(text, &mut buffer)
};
}
replace!(DOS_MAC_NEWLINES);
replace_leading_spaces(text);
replace!(WHITESPACE_ONLY_LINE);
replace!(CONCAT_LINES);
replace!(TABS);
replace!(NULL_SPACE);
replace!(LEADING_NEWLINES);
replace!(TRAILING_NEWLINES);
}
/// Replaces non-standard spaces at the start of each line with regular spaces.
///
/// Every matched character (U+00A0 or U+2007) is replaced by exactly one ASCII
/// space, so the number of leading characters on the line is preserved while
/// the byte length of the text shrinks.
fn replace_leading_spaces(text: &mut String) {
    trace!("Replacing leading non-standard spaces with regular spaces");

    // Byte offset from which the next search begins.
    let mut offset = 0;

    // Only the overall match is needed, so `find_at` is used instead of
    // `captures_at`. Searching "at" an offset (rather than slicing the string)
    // keeps the preceding context visible, so `^` still only matches at real
    // line starts.
    while let Some(mtch) = LEADING_NONSTANDARD_WHITESPACE.find_at(text, offset) {
        let range = mtch.range();
        let count = mtch.as_str().chars().count();
        let spaces = " ".repeat(count);

        // The replacement consists of `count` one-byte spaces, so after the
        // edit it ends at byte `start + count`; resume searching from there.
        offset = range.start + count;
        text.replace_range(range, &spaces);
    }
}
/// Input / expected-output pairs passed to `test_substitution` for `substitute`.
#[cfg(test)]
const TEST_CASES: [(&str, &str); 7] = [
    // Tabs are replaced and the trailing newline is stripped.
    (
        "\tapple\n\tbanana\tcherry\n",
        " apple\n banana cherry",
    ),
    // Mixed DOS (`\r\n`), legacy Mac (`\r`) and Unix (`\n`) line endings.
    (
        "newlines:\r\n* apple\r* banana\r\ncherry\n\r* durian",
        "newlines:\n* apple\n* banana\ncherry\n\n* durian",
    ),
    // Runs of blank lines collapse to a single blank line; trailing newlines are removed.
    (
        "apple\nbanana\n\ncherry\n\n\npineapple\n\n\n\nstrawberry\n\n\n\n\nblueberry\n\n\n\n\n\n",
        "apple\nbanana\n\ncherry\n\npineapple\n\nstrawberry\n\nblueberry",
    ),
    // Same as above, but using legacy Mac line endings in the input.
    (
        "apple\rbanana\r\rcherry\r\r\rpineapple\r\r\r\rstrawberry\r\r\r\r\rblueberry\r\r\r\r\r\r",
        "apple\nbanana\n\ncherry\n\npineapple\n\nstrawberry\n\nblueberry",
    ),
    // Backslash-newline joins lines; a backslash not followed by a newline is kept.
    (
        "concat:\napple banana \\\nCherry\\\nPineapple \\ grape\nblueberry\n",
        "concat:\napple banana CherryPineapple \\ grape\nblueberry",
    ),
    // Lines containing only whitespace are emptied.
    ("<\n \n \n \n \n>", "<\n\n>"),
    // Each of the three leading non-standard spaces becomes one regular space;
    // together with the regular space that follows, that is four spaces.
    ("\u{00a0}\u{00a0}\u{2007} apple", "    apple"),
];
/// Forces every lazily-built pattern in this module to initialize, so that an
/// invalid regular expression fails this test instead of panicking on first use.
#[test]
fn regexes() {
    let _ = LazyLock::force(&LEADING_NONSTANDARD_WHITESPACE);
    let _ = LazyLock::force(&WHITESPACE_ONLY_LINE);
    let _ = LazyLock::force(&LEADING_NEWLINES);
    let _ = LazyLock::force(&TRAILING_NEWLINES);
    let _ = LazyLock::force(&DOS_MAC_NEWLINES);
    let _ = LazyLock::force(&CONCAT_LINES);
    let _ = LazyLock::force(&TABS);
    let _ = LazyLock::force(&NULL_SPACE);
}
/// Hands `substitute` and `TEST_CASES` to the shared `test_substitution`
/// helper under the label "miscellaneous".
#[test]
fn test_substitute() {
    super::test::test_substitution("miscellaneous", substitute, &TEST_CASES);
}