use std::collections::BTreeSet;
use deunicode::deunicode;
use once_cell::sync::Lazy;
use regex::Regex;
/// Matches a hashtag: a `#`, one ASCII letter, then one or more characters that
/// are neither whitespace nor `#`, followed by at most one whitespace character.
///
/// Because of the `+`, at least two characters must follow the `#`, so `#a` on
/// its own does not match; neither does a tag whose first character is a digit.
static RE_TAG: Lazy<Regex> = Lazy::new(|| Regex::new(r"#[a-zA-Z][^\s#]+\s?").unwrap());
/// Matches a run of three or more consecutive line feeds (`\n`).
static RE_BLOCKS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\n{3,}").unwrap());
/// Trims every line and separates the remaining non-blank lines with one empty line.
///
/// Each line of `string` is stripped of leading and trailing whitespace. Lines that
/// are empty after trimming are dropped and the survivors are joined with `"\n\n"`.
/// The result carries no trailing newline; input without any visible content yields
/// an empty string.
#[must_use]
pub fn normalize_whitespace(string: &str) -> String {
    string
        .lines()
        // Trim before filtering so whitespace-only lines are discarded as well,
        // instead of becoming empty entries that widen the gap between paragraphs.
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n\n")
}
/// Collects every tag found in `string` into an ordered, de-duplicated set.
///
/// A tag is whatever `RE_TAG` matches. Whitespace captured around a match is
/// trimmed away, while the leading `#` is kept (for example `"#tag01"`).
#[must_use]
pub fn extract_tags(string: &str) -> BTreeSet<String> {
    // `BTreeSet` orders and de-duplicates its elements itself, so the matches are
    // collected straight into it without an intermediate, pre-sorted `Vec`.
    RE_TAG
        .find_iter(string)
        .map(|tag| tag.as_str().trim().to_owned())
        .collect()
}
/// Returns a copy of `string` with every `RE_TAG` match deleted.
///
/// Only the leading and trailing whitespace of the result is trimmed; spacing
/// between the remaining words is left as it is.
#[must_use]
pub fn remove_tags(string: &str) -> String {
    let without_tags = RE_TAG.replace_all(string, "");
    String::from(without_tags.trim())
}
/// Converts the whole of `string` to ASCII.
///
/// This is a thin wrapper that delegates to the `deunicode` crate's `deunicode`
/// function and returns its result unchanged.
#[must_use]
pub fn convert_all_to_ascii(string: &str) -> String {
deunicode(string)
}
/// Replaces every symbol listed in `crate::defaults::UNICODE_TO_ASCII_SYMBOLS`
/// with the replacement it is paired with.
///
/// The pairs are applied one after another in the table's iteration order, each
/// pass working on the output of the previous one. Characters that are not in the
/// table are left untouched.
#[must_use]
pub fn convert_symbols_to_ascii(string: &str) -> String {
    IntoIterator::into_iter(&*crate::defaults::UNICODE_TO_ASCII_SYMBOLS)
        .fold(string.to_owned(), |converted, (from, to)| {
            converted.replace(*from, to)
        })
}
/// Collapses oversized vertical gaps and ends the text with exactly one newline.
///
/// Every `RE_BLOCKS` match (three or more consecutive line feeds) is replaced with
/// `"\n\n"`, all trailing whitespace is removed, and a single `'\n'` is appended.
/// An empty or all-whitespace input therefore yields `"\n"`.
#[must_use]
pub fn trim_blocks(string: &str) -> String {
    let collapsed = RE_BLOCKS.replace_all(string, "\n\n");
    format!("{}\n", collapsed.trim_end())
}
#[cfg(test)]
mod test_processes {
    use super::*;

    // Generates one `#[test]` function per entry. Every entry supplies, in order:
    // the input text, the text expected back from `remove_tags`, and the tags
    // expected from `extract_tags`.
    macro_rules! test_tags {
        ($($name:ident: ($input:tt, $tags_removed_expected:tt, $tags_expected:tt),)*) => {
            $(
                #[test]
                fn $name() {
                    let tags_extracted = extract_tags($input);
                    let tags_expected: BTreeSet<String> = $tags_expected
                        .into_iter()
                        .map(|t: &str| t.to_string())
                        .collect();
                    let tags_removed = remove_tags($input);
                    assert_eq!(tags_extracted, tags_expected);
                    assert_eq!(tags_removed, $tags_removed_expected.to_string());
                }
            )*
        }
    }

    test_tags! {
        // No tags at all: the text is returned untouched and the set is empty.
        test_extract_tags_00: (
            "Lorem ipsum.",
            "Lorem ipsum.",
            []
        ),
        test_extract_tags_01: (
            "Lorem ipsum. #tag01 #tag02",
            "Lorem ipsum.",
            ["#tag01", "#tag02"]
        ),
        // Trailing whitespace after the last tag is trimmed away.
        test_extract_tags_02: (
            "Lorem ipsum. #tag01 #tag02 ",
            "Lorem ipsum.",
            ["#tag01", "#tag02"]
        ),
        test_extract_tags_03: (
            "Lorem ipsum. #tag01 #tag02",
            "Lorem ipsum.",
            ["#tag01", "#tag02"]
        ),
        // A tag has to start with a letter, so `#02` and `#03` stay in the text.
        test_extract_tags_04: (
            "#tag01 #02 #03",
            "#02 #03",
            ["#tag01"]
        ),
        // Repeated tags collapse into a single set entry.
        test_extract_tags_05: (
            "#tag01 #tag01 #tag01",
            "",
            ["#tag01"]
        ),
    }
}