use lazy_static::lazy_static;
use regex::Regex;
/// Characters that are deleted outright from a title.
///
/// The table is turned into a regex alternation by `big_collection!` (as `mk_strip()`),
/// and every run of these characters is replaced with the empty string.
#[rustfmt::skip]
const TO_STRIP: [char; 28] = [
// Soft hyphen.
'\u{00ad}',
// Inverted exclamation mark and inverted question mark.
'\u{00a1}', '\u{00bf}',
// Double and single angle quotation marks (guillemets).
'\u{00ab}', '\u{00bb}', '\u{2039}', '\u{203a}',
// Curly / low-9 / reversed single and double quotation marks, then the bullet.
'\u{2018}', '\u{2019}', '\u{201a}', '\u{201b}', '\u{201c}', '\u{201d}', '\u{201e}', '\u{201f}', '\u{2022}',
// Copyright, registered, degree, horizontal ellipsis, trade mark.
'\u{00a9}', '\u{00ae}', '\u{00b0}', '\u{2026}', '\u{2122}',
// Acute accents: spacing, modifier letter, combining, combining tone mark.
'\u{00b4}', '\u{02ca}', '\u{0301}', '\u{0341}',
// Combining grave accent, combining macron, combining caron.
'\u{0300}', '\u{0304}', '\u{030c}',
];
/// Literal sequences that act as word separators.
///
/// The table is turned into a regex alternation by `big_collection!` (as `mk_rewrite()`),
/// and every run of these sequences is replaced with a single `-`.
///
/// Non-breaking space, en dash and em dash are each listed three ways: as a named
/// character reference, as a numeric character reference, and as the literal character.
/// The references must be well formed (`&#8211;`, not `&8211;`); otherwise they never
/// match here, and the later "remove remaining entities" pass deletes the entity
/// instead of turning it into a separator. Entity names are lower-case because the
/// title is lower-cased before any matching happens.
#[rustfmt::skip]
const TO_REWRITE: [&str; 10] = [
    // Non-breaking space, en dash, em dash as HTML character references.
    "&nbsp;", "&#160;", "&ndash;", "&#8211;", "&mdash;", "&#8212;",
    // The same three characters as literals, plus the plain hyphen-minus.
    "\u{00a0}", "\u{2013}", "\u{2014}", "-",
];
/// Defines `fn $name() -> String` that renders the table `$items` as the regex
/// source `(a|b|c)+`: one alternative per entry, matching one or more in a row.
///
/// Entries are inserted verbatim via `to_string()`; nothing is regex-escaped, so the
/// table must not contain characters that are special in a regex alternation.
macro_rules! big_collection {
    ( $items:ident, $name:ident ) => {
        #[inline]
        fn $name() -> String {
            let mut pattern = String::from("(");
            for (position, item) in $items.iter().enumerate() {
                // Separator goes between alternatives, never before the first one.
                if position > 0 {
                    pattern.push('|');
                }
                pattern.push_str(&item.to_string());
            }
            pattern.push_str(")+");
            pattern
        }
    };
}
/// Runs a sequence of `replace_all` passes over the binding `$text`, in the order given.
///
/// Arguments are the name of an existing binding followed by `MATCHER, "replacement",`
/// pairs (each pair, including the last, ends with a comma). After every pass the
/// binding is shadowed with that pass's result, so the caller keeps using the same
/// name and sees the output of the final pass.
macro_rules! mk_workspace {
    ( $text:ident, $( $matcher:ident, $replacement:literal, )* ) => {
        $(
            let $text = $matcher.replace_all(&$text, $replacement);
        )*
    };
}
/// Declares one `lazy_static!` `Regex` per `NAME, pattern,` pair.
///
/// Each pattern expression is passed to `Regex::new` and the result is unwrapped, so
/// an invalid pattern panics when that static is initialised.
macro_rules! extra_lazy {
    ( $( $name:ident, $pattern:expr, )* ) => {
        lazy_static! {
            $(
                static ref $name: Regex = Regex::new($pattern).unwrap();
            )*
        }
    };
}
// Generates `mk_strip()`: regex source matching any run of `TO_STRIP` characters.
big_collection!(TO_STRIP, mk_strip);
// Generates `mk_rewrite()`: regex source matching any run of `TO_REWRITE` sequences.
big_collection!(TO_REWRITE, mk_rewrite);
/// Regex source matching a whole `<script>…</script>` or `<style>…</style>` element,
/// body included, so that the embedded code never ends up in the slug.
///
/// The leading `(?s)` lets `.` match line breaks. Without it an element whose body
/// spans several lines is not matched as a unit: only its tags are removed later and
/// the body text leaks into the result. No case-insensitive flag is needed because
/// the title is lower-cased before this pattern is applied.
const SCRIPT_AND_STYLE: &str = r"(?s)(<script[^>]*?>.*?</script>|<style[^>]*?>.*?</style>)";
/// Lower-cases `title`, strips markup and punctuation, and returns the surviving words.
///
/// The passes run in this order:
/// 1. remove `<script>` / `<style>` elements together with their contents;
/// 2. remove every remaining tag;
/// 3. delete the characters listed in `TO_STRIP`;
/// 4. turn runs of the `TO_REWRITE` sequences into `-`;
/// 5. delete whatever still looks like an entity (`&…;`);
/// 6. turn runs of `. ? ! ; : _ @`, carriage return and line feed into `-`;
/// 7. delete everything that is not `%`, alphabetic, an ASCII digit, a space or `-`.
///
/// The result is then split on spaces and hyphens, and empty pieces are dropped.
///
/// # Panics
///
/// Panics if one of the built-in patterns fails to compile.
pub fn sanitize_and_split(title: &str) -> Vec<String> {
    #[rustfmt::skip]
    extra_lazy! {
        STRIP_DANGEROUS_TAGS,      SCRIPT_AND_STYLE,
        REMOVE_TAGS,               r"<[^>]*?>",
        REMOVE_SOFT_PUNCT,         &mk_strip(),
        REWRITE_SOFT_PUNCT,        &mk_rewrite(),
        REMOVE_REMAINING_ENTITIES, r"&.+?;",
        REWRITE_ACCEPTABLE_PUNCT,  r"[\.\?!;:_@\r\n]+",
        REMOVE_REMAINING_PUNCT,    r"[^%\p{Alphabetic}0-9 -]+",
    }

    let cleaned = title.to_lowercase();

    // Order matters: entities must be rewritten before the catch-all entity removal,
    // and tags must go before the generic punctuation passes eat their brackets.
    #[rustfmt::skip]
    mk_workspace!(
        cleaned,
        STRIP_DANGEROUS_TAGS,      "",
        REMOVE_TAGS,               "",
        REMOVE_SOFT_PUNCT,         "",
        REWRITE_SOFT_PUNCT,        "-",
        REMOVE_REMAINING_ENTITIES, "",
        REWRITE_ACCEPTABLE_PUNCT,  "-",
        REMOVE_REMAINING_PUNCT,    "",
    );

    let separators: &[char] = &[' ', '-'];
    cleaned
        .split(separators)
        .filter(|word| !word.is_empty())
        .map(String::from)
        .collect()
}
/// Builds a slug from `title`: the words produced by `sanitize_and_split`, joined
/// with single hyphens. Returns an empty string when no words survive.
pub fn slugify(title: &str) -> String {
    let words = sanitize_and_split(title);
    words.join("-")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `(raw title, expected slug)` pairs covering tags, punctuation runs,
    /// non-ASCII letters, stray separators and ampersands.
    const SAMPLES: [(&str, &str); 10] = [
        ("This is a test.", "this-is-a-test"),
        ("This is a <script>alert('!')</script> test", "this-is-a-test"),
        ("this is a <em>test</em>", "this-is-a-test"),
        (" this is --- a <em>test</em> ", "this-is-a-test"),
        ("Excellent!!!1!1", "excellent-1-1"),
        ("make\nit work?", "make-it-work"),
        ("Töxic Tësticle Färm?", "töxic-tësticle-färm"),
        (" ----You--and--_-_me", "you-and-me"),
        ("Boys & Girls & Those Elsewhere", "boys-girls-those-elsewhere"),
        ("user@example.com", "user-example-com"),
    ];

    /// Every sample title must slugify to its expected value.
    #[test]
    fn basic_checks() {
        for &(title, expected) in &SAMPLES {
            assert_eq!(slugify(title), expected);
        }
    }
}