use const_format::concatcp;
/// Seed for the "last kept character" state used by `sanitize`.
///
/// It is neither a space, an underscore nor whitespace, so none of the
/// "after last processed ..." filters below can fire because of it.
const LAST_PROCESSED_START_CHAR: char = 'A';

/// Input characters that `sanitize` replaces with an underscore.
const REPLACE_ORIG_WITH_UNDERSCORE: &str = r":\/|?~";

/// Input characters that `sanitize` replaces with a space.
const REPLACE_ORIG_WITH_SPACE: &str = r#"<>"*#%{}^`"#;

/// Processed characters that are dropped when the previously kept character
/// was a space (collapses runs of spaces).
const FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE: &str = " ";

/// Processed characters that are dropped when the previously kept character
/// was an underscore (collapses runs of underscores).
const FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE: &str = "_";

/// Input characters that are dropped when the previously kept character is
/// whitespace.
const FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE: &str = r"_.\/,;";

/// Invisible formatting characters that `sanitize` always drops.
const FILTER_ORIG_NON_PRINTING_CHARS: &str = concat!(
    // U+200B ZERO WIDTH SPACE
    "\u{200B}",
    // U+202A..=U+202E: bidirectional embedding, override and pop controls
    "\u{202A}\u{202B}\u{202C}\u{202D}\u{202E}",
    // U+2066..=U+2069: bidirectional isolate controls
    "\u{2066}\u{2067}\u{2068}\u{2069}",
);

/// Characters that, together with whitespace, are trimmed from both ends of
/// every sanitized line.
pub const TRIM_LINE_CHARS: &str = "_-.,;";

/// Character appended after every sanitized line; it takes the place of the
/// line break.
const INSERT_LINE_SEPARATOR: char = '-';

/// Character trimmed from the end of the complete result, i.e. the trailing
/// line separator(s).
const TRIM_END_LINES: char = INSERT_LINE_SEPARATOR;
/// Every character that `sanitize` unconditionally maps to `_` or to a space:
/// the compile-time concatenation of `REPLACE_ORIG_WITH_UNDERSCORE` and
/// `REPLACE_ORIG_WITH_SPACE`.
// NOTE(review): nothing in this file reads this constant; the `allow` is
// presumably there because it is only consumed elsewhere — confirm.
#[allow(dead_code)]
pub const ALWAYS_REPLACED_OR_FILTERED_CHARS: &str =
concatcp!(REPLACE_ORIG_WITH_UNDERSCORE, REPLACE_ORIG_WITH_SPACE);
/// Every character that `sanitize` may replace, drop or trim, depending on its
/// position: the compile-time concatenation of all character classes used by
/// `sanitize`, in the order listed below. Characters that belong to several
/// classes appear more than once.
// NOTE(review): nothing in this file reads this constant; the `allow` is
// presumably there because it is only consumed elsewhere — confirm.
#[allow(dead_code)]
pub const POTENTIALLY_REPLACED_CHARS: &str = concatcp!(
REPLACE_ORIG_WITH_UNDERSCORE,
REPLACE_ORIG_WITH_SPACE,
FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE,
FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE,
FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE,
FILTER_ORIG_NON_PRINTING_CHARS,
TRIM_LINE_CHARS,
TRIM_END_LINES
);
/// Returns a cleaned-up copy of `s` in which problematic characters are
/// replaced, collapsed or removed and line breaks are turned into `-`.
///
/// The input is split with [`str::lines`]. Every character of a line goes
/// through the following steps, in this order:
///
/// 1. Any whitespace character becomes a plain space.
/// 2. Remaining control characters are dropped.
/// 3. Characters listed in `REPLACE_ORIG_WITH_UNDERSCORE` become `_`, those
///    listed in `REPLACE_ORIG_WITH_SPACE` become a space.
/// 4. The character is dropped when
///    * it is now a space and the previously kept character was a space,
///    * it is now an underscore and the previously kept character was an
///      underscore,
///    * it was (before step 3) one of
///      `FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE` and the previously
///      kept character is whitespace, or
///    * it is one of `FILTER_ORIG_NON_PRINTING_CHARS`.
///
/// Each resulting line is trimmed of whitespace and `TRIM_LINE_CHARS` at both
/// ends and followed by `INSERT_LINE_SEPARATOR`. Finally all trailing
/// `TRIM_END_LINES` characters are removed from the joined result, so
/// `"abc\nefg"` yields `"abc-efg"`.
///
/// The "previously kept character" state is shared by all lines: it is *not*
/// reset at a line break and it does not see the separator or the trimming.
///
/// NOTE(review): only the *end* of the joined result is trimmed. An empty
/// first line therefore produces a leading separator (`"\nabc"` gives
/// `"-abc"`) and consecutive empty lines produce repeated separators —
/// confirm that this is intended.
pub fn sanitize(s: &str) -> String {
    // Last character that survived the filters, carried across lines.
    let mut last_processed_chr = LAST_PROCESSED_START_CHAR;

    // No step ever emits more bytes than it consumes, and each line break is
    // swapped for a one-byte separator, so this capacity is never exceeded.
    let mut sanitized = String::with_capacity(s.len() + 1);
    // Scratch buffer for the current line, reused to avoid reallocating.
    let mut line_buf = String::new();

    for line in s.lines() {
        line_buf.clear();

        for raw in line.chars() {
            // Normalise whitespace *before* the control-character check so
            // that e.g. a tab survives as a space instead of being dropped.
            let c_orig = if raw.is_whitespace() { ' ' } else { raw };
            if c_orig.is_control() {
                continue;
            }

            let c = if REPLACE_ORIG_WITH_UNDERSCORE.contains(c_orig) {
                '_'
            } else if REPLACE_ORIG_WITH_SPACE.contains(c_orig) {
                ' '
            } else {
                c_orig
            };

            let discard = (FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_SPACE.contains(c)
                && last_processed_chr == ' ')
                || (FILTER_PROCESSED_AFTER_LAST_PROCESSED_WAS_UNDERSCORE.contains(c)
                    && last_processed_chr == '_')
                || (FILTER_ORIG_AFTER_LAST_PROCESSED_WAS_WHITESPACE.contains(c_orig)
                    && last_processed_chr.is_whitespace())
                || FILTER_ORIG_NON_PRINTING_CHARS.contains(c_orig);
            if discard {
                continue;
            }

            // Only characters that are actually kept update the state.
            last_processed_chr = c;
            line_buf.push(c);
        }

        let trimmed =
            line_buf.trim_matches(|c: char| c.is_whitespace() || TRIM_LINE_CHARS.contains(c));
        sanitized.push_str(trimmed);
        // The separator stands in for the line break consumed by `lines()`.
        sanitized.push(INSERT_LINE_SEPARATOR);
    }

    // Drop the separator after the last line (and those of trailing empty
    // lines) in place, without allocating another string.
    let kept_len = sanitized.trim_end_matches(TRIM_END_LINES).len();
    sanitized.truncate(kept_len);
    sanitized
}
#[cfg(test)]
mod tests {
    use super::sanitize;

    /// Spot checks, one per sanitizing rule.
    #[test]
    fn test_sanitize() {
        // Whitespace is normalised to spaces and trimmed at the line ends.
        assert_eq!(sanitize("\tabc\tefg\t"), "abc efg");
        // Control characters are dropped.
        assert_eq!(sanitize("abc\u{0019}efg"), "abcefg");
        // Characters replaced with `_`; the run of underscores collapses.
        assert_eq!(sanitize("abc:\\/|?~=efg"), "abc_=efg");
        // Characters replaced with a space; the run of spaces collapses.
        assert_eq!(sanitize("abc<>\"*<>#%{}^[]+[]`efg"), "abc []+[] efg");
        // Every line is trimmed and the line break becomes `-`.
        assert_eq!(sanitize("-_ \tabc \t >_-\n efg \t_-"), "abc-efg");
        assert_eq!(sanitize("abc\nefg"), "abc-efg");
        assert_eq!(sanitize("abc\r\nefg"), "abc-efg");
        // Underscores following whitespace are dropped.
        assert_eq!(sanitize("abc_ __ efg __hij"), "abc_ efg hij");
        assert_eq!(
            sanitize("https://blog.getreu.net/projects/"),
            "https_blog.getreu.net_projects"
        );
    }

    /// `(input, expected output)` pairs checked by `test_string_list`.
    ///
    /// Input and expectation are kept side by side so that they cannot get
    /// out of step with each other.
    static CASES: &[(&str, &str)] = &[
        (
            "the quick brown fox jumped over the lazy dog",
            "the quick brown fox jumped over the lazy dog",
        ),
        ("résumé", "résumé"),
        ("hello\u{0000}world", "helloworld"),
        ("hello\nworld", "hello-world"),
        (";-_hello.,\n,.world_-;", "hello-world"),
        ("semi;colon", "semi;colon"),
        (";leading-semi", "leading-semi"),
        ("com,ma", "com,ma"),
        ("equals=", "equals="),
        ("slash\\", "slash"),
        ("slash/", "slash"),
        ("col:on", "col_on"),
        ("star*", "star"),
        ("question?", "question"),
        ("quote\"", "quote"),
        ("singlequote'", "singlequote'"),
        ("brack<e>ts", "brack e ts"),
        ("p|pes", "p_pes"),
        ("plus+", "plus+"),
        ("'five and six<seven'", "'five and six seven'"),
        (" space at front", "space at front"),
        ("space at end ", "space at end"),
        (".period", "period"),
        ("period.", "period"),
        ("relative/path/to/some/dir", "relative_path_to_some_dir"),
        ("/abs/path/to/some/dir", "abs_path_to_some_dir"),
        ("~/.\u{0000}notssh/authorized_keys", "notssh_authorized_keys"),
        ("", ""),
        ("h?w", "h_w"),
        ("h/w", "h_w"),
        ("h*w", "h w"),
        (".", ""),
        ("..", ""),
        ("./", ""),
        ("../", ""),
        ("/..", ""),
        ("/../", ""),
        ("*.|.", ""),
        ("./", ""),
        ("./foobar", "foobar"),
        ("../foobar", "foobar"),
        ("../../foobar", "foobar"),
        ("./././foobar", "foobar"),
        ("|*.what", "what"),
        ("LPT9.asdf", "LPT9.asdf"),
        ("author| title", "author_ title"),
        ("author | title", "author _ title"),
        ("author: title", "author_ title"),
        ("auteur : titre", "auteur _ titre"),
        ("author, title", "author, title"),
        ("no , enumeration", "no enumeration"),
        ("Any questions? Or not?", "Any questions_ Or not"),
        ("Des questions ? Ou pas ?", "Des questions _ Ou pas"),
        ("Hello!", "Hello!"),
        ("filename(1).ext", "filename(1).ext"),
        ("1,23", "1,23"),
        ("1.23", "1.23"),
        ("foo\u{200b}bar", "foobar"),
    ];

    /// Runs `sanitize` over every entry of `CASES`.
    #[test]
    fn test_string_list() {
        for &(input, expected) in CASES {
            // Name the input on failure; many cases share the same output.
            assert_eq!(sanitize(input), expected, "input: {:?}", input);
        }
    }
}