1use lazy_static::lazy_static;
33use regex::Regex;
34
/// Typographic characters that carry no meaning in a slug.
///
/// `mk_strip` joins these into a single regex alternation, and
/// `sanitize_and_split` deletes every run of them from the title.
#[rustfmt::skip]
const TO_STRIP: [char; 28] = [
    // Soft hyphen.
    '\u{00ad}',
    // Inverted exclamation mark and inverted question mark.
    '\u{00a1}', '\u{00bf}',
    // Angle quotation marks (guillemets), double and single.
    '\u{00ab}', '\u{00bb}', '\u{2039}', '\u{203a}',
    // Curly, low-9 and high-reversed-9 quotation marks, plus the bullet.
    '\u{2018}', '\u{2019}', '\u{201a}', '\u{201b}', '\u{201c}', '\u{201d}', '\u{201e}', '\u{201f}', '\u{2022}',
    // Copyright, registered, degree, horizontal ellipsis, trade mark.
    '\u{00a9}', '\u{00ae}', '\u{00b0}', '\u{2026}', '\u{2122}',
    // Acute accents: spacing, modifier letter, combining, combining tone mark.
    '\u{00b4}', '\u{02ca}', '\u{0301}', '\u{0341}',
    // Combining grave accent, combining macron, combining caron.
    '\u{0300}', '\u{0304}', '\u{030c}',
];
53
/// Entities and characters that act as word separators.
///
/// `mk_rewrite` joins these into a single regex alternation, and
/// `sanitize_and_split` rewrites every run of them to one `-`. The HTML
/// entities are listed in both named and numeric form (non-breaking space,
/// en dash, em dash) next to the literal characters they stand for. They must
/// be spelled as entities here: the input is matched as raw text, so an
/// already-decoded character would never match `&nbsp;` and friends.
const TO_REWRITE: [&str; 10] = [
    "&nbsp;", "&#160;", "&ndash;", "&#8211;", "&mdash;", "&#8212;",
    "\u{00a0}", "\u{2013}", "\u{2014}", "-",
];
57
/// Generates a function `$fnn` that renders the collection `$ty` as the
/// source text of a regex of the form `(a|b|c)+`.
///
/// `$ty` must name something with an `iter()` whose items implement
/// `ToString` (the file uses it with arrays of `char` and of `&str`). The
/// items are inserted verbatim, without regex escaping, so they must not
/// contain regex metacharacters.
macro_rules! big_collection {
    ( $ty:ident, $fnn:ident ) => {
        #[inline]
        fn $fnn() -> String {
            let alternatives: Vec<String> = $ty.iter().map(|item| item.to_string()).collect();
            format!("({})+", alternatives.join("|"))
        }
    };
}
72
/// Threads the binding `$text` through a sequence of replacements.
///
/// For each `pattern, replacement,` pair (the trailing comma is required),
/// `$text` is re-bound, by shadowing, to
/// `pattern.replace_all(&$text, replacement)`. The pairs are applied in the
/// order written, and after the macro `$text` names the final result.
macro_rules! mk_workspace {
    ( $text:ident, $( $pattern:ident, $replacement:literal, )* ) => {
        $(
            let $text = $pattern.replace_all(&$text, $replacement);
        )*
    }
}
78
/// Declares one lazily-initialized `Regex` static per `NAME, pattern,` pair
/// (the trailing comma is required), all inside a single `lazy_static!`
/// block.
///
/// Each pattern expression is handed to `Regex::new`. The patterns are fixed
/// at compile time, so a pattern that fails to compile is a programming
/// error: first use of that static panics with a message naming it.
macro_rules! extra_lazy {
    ( $( $name:ident, $pattern:expr, )* ) => {
        lazy_static! {
            $(
                static ref $name: Regex = Regex::new($pattern)
                    .expect(concat!("pattern for ", stringify!($name), " must be a valid regex"));
            )*
        }
    }
}
86
87big_collection!(TO_STRIP, mk_strip);
88big_collection!(TO_REWRITE, mk_rewrite);
89
/// Matches a whole `<script>…</script>` or `<style>…</style>` element,
/// including its content, so the content never leaks into the slug.
///
/// The `(?s)` flag lets `.` match line breaks as well; without it an element
/// whose body spans several lines would not match, and only its tags would
/// be removed later. The tag names are lowercase because the pattern is
/// applied to already-lowercased text.
const SCRIPT_AND_STYLE: &str = r"(?s)(<script[^>]*?>.*?</script>|<style[^>]*?>.*?</style>)";
91
92pub fn sanitize_and_split(title: &str) -> Vec<String> {
98 #[rustfmt::skip]
99 extra_lazy! {
100 STRIP_DANGEROUS_TAGS, SCRIPT_AND_STYLE,
101 REMOVE_TAGS, r"<[^>]*?>",
102 REMOVE_SOFT_PUNCT, &mk_strip(),
103 REWRITE_SOFT_PUNCT, &mk_rewrite(),
104 REMOVE_REMAINING_ENTITIES, r"&.+?;",
105 REWRITE_ACCEPTABLE_PUNCT, r"[\.\?!;:_@\r\n]+",
106 REMOVE_REMAINING_PUNCT, r"[^%\p{Alphabetic}0-9 -]+",
107 }
108
109 let workspace = title.to_string().to_lowercase();
110
111 #[rustfmt::skip]
112 mk_workspace!(
113 workspace,
114 STRIP_DANGEROUS_TAGS, "",
115 REMOVE_TAGS, "",
116 REMOVE_SOFT_PUNCT, "",
117 REWRITE_SOFT_PUNCT, "-",
118 REMOVE_REMAINING_ENTITIES, "",
119 REWRITE_ACCEPTABLE_PUNCT, "-",
120 REMOVE_REMAINING_PUNCT, "",
121 );
122
123 workspace
124 .split(|c| c == ' ' || c == '-')
125 .filter(|s| !s.is_empty())
126 .map(|s| s.to_string())
127 .collect()
128}
129
130pub fn slugify(title: &str) -> String {
133 sanitize_and_split(title).join("-")
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// `(title, expected slug)` pairs covering punctuation, markup,
    /// whitespace runs, non-ASCII letters and e-mail-like input.
    const SAMPLES: [(&str, &str); 10] = [
        ("This is a test.", "this-is-a-test"),
        ("This is a <script>alert('!')</script> test", "this-is-a-test"),
        ("this is a <em>test</em>", "this-is-a-test"),
        (" this is --- a <em>test</em> ", "this-is-a-test"),
        ("Excellent!!!1!1", "excellent-1-1"),
        ("make\nit work?", "make-it-work"),
        ("Töxic Tësticle Färm?", "töxic-tësticle-färm"),
        (" ----You--and--_-_me", "you-and-me"),
        ("Boys & Girls & Those Elsewhere", "boys-girls-those-elsewhere"),
        ("user@example.com", "user-example-com"),
    ];

    /// Every sample title must slugify to exactly its expected slug.
    #[test]
    fn basic_checks() {
        for (title, expected) in SAMPLES.iter() {
            assert_eq!(slugify(title), *expected, "slugify({:?})", title);
        }
    }
}