Skip to main content

wpslugify/
lib.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
//! # WordPress-style slugification
6//!
7//! This library provides a simple slugification algorithm that is a
8//! close-as-possible direct port of the WordPress
9//! [`sanitize_title_with_dashes()`](https://developer.wordpress.org/reference/functions/sanitize_title_with_dashes/)
10//! function that WordPress uses to generate
11//! [slugs](https://en.wikipedia.org/wiki/Clean_URL#Slug).
12//!
13//! Examples:
14//!
15//! ```
16//! # use wpslugify::slugify;
17//! assert_eq!(slugify("This is a test."), "this-is-a-test");
//! assert_eq!(slugify("This is a <script>alert('!')</script> test"), "this-is-a-test");
19//! assert_eq!(slugify("Excellent!!!1!1"), "excellent-1-1");
20//! assert_eq!(slugify("user@example.com"), "user-example-com");
21//! ```
22//!
//! This slugification leaves characters that have the Unicode
//! `Alphabetic` property intact, so accented and non-Latin letters
//! survive.  Yes, there's really a band with this name:
26//!
27//! ```
28//! # use wpslugify::slugify;
29//! assert_eq!(slugify("Töxic Tësticle Färm?"), "töxic-tësticle-färm");
30//! ```
31
32use lazy_static::lazy_static;
33use regex::Regex;
34
// Characters that are deleted outright (replaced with nothing) when a
// title is sanitized.
//
// The table is laid out by hand, one category per group, so rustfmt
// is told to leave it alone.
#[rustfmt::skip]
const TO_STRIP: [char; 28] = [
    // Soft hyphen.
    '\u{00ad}',
    // Inverted exclamation mark and inverted question mark.
    '\u{00a1}', '\u{00bf}',
    // Angle quotes: double, then single.
    '\u{00ab}', '\u{00bb}',
    '\u{2039}', '\u{203a}',
    // Curly quotes: single, then double.
    '\u{2018}', '\u{2019}', '\u{201a}', '\u{201b}',
    '\u{201c}', '\u{201d}', '\u{201e}', '\u{201f}',
    // Bullet.
    '\u{2022}',
    // Copyright, registered, degree, ellipsis and trademark signs.
    '\u{00a9}', '\u{00ae}', '\u{00b0}', '\u{2026}', '\u{2122}',
    // Acute accents.
    '\u{00b4}', '\u{02ca}', '\u{0301}', '\u{0341}',
    // Grave accent, macron, caron.
    '\u{0300}', '\u{0304}', '\u{030c}',
];
53
// Separator-like entities and characters that are rewritten to a
// hyphen when a title is sanitized: the non-breaking space, the en
// dash and the em dash (each as a named entity, a numeric entity and
// a literal character), plus the plain ASCII hyphen.
//
// The numeric entities must keep their `#` (`&#8211;`, not `&8211;`);
// otherwise they fall through to the generic entity removal and the
// words on either side get glued together.
#[rustfmt::skip]
const TO_REWRITE: [&str; 10] = [
    // Entity spellings: non-breaking space, en dash, em dash.
    "&nbsp;", "&#160;",
    "&ndash;", "&#8211;",
    "&mdash;", "&#8212;",
    // The same three characters as literals, then the ASCII hyphen.
    "\u{00a0}", "\u{2013}", "\u{2014}", "-",
];
57
/// Generates a private function `$builder()` that renders the table
/// `$table` as the source of a regular expression of the form
/// `(a|b|c)+`, i.e. "one or more of any entry in the table".
///
/// `$table` must name something with an `iter()` whose items
/// implement `to_string()`.  Entries are inserted verbatim, with no
/// regex escaping.
macro_rules! big_collection {
    ( $table:ident, $builder:ident ) => {
        #[inline]
        fn $builder() -> String {
            let alternatives: Vec<String> = $table.iter().map(ToString::to_string).collect();
            format!("({})+", alternatives.join("|"))
        }
    };
}
72
/// Threads a value through a pipeline of replacements.
///
/// For each `pattern, replacement,` pair, in the order given, the
/// binding named by `$text` is shadowed by the result of
/// `pattern.replace_all(&$text, replacement)`.  After the macro the
/// name `$text` refers to the output of the final step.  Every pair,
/// including the last, must be followed by a comma.
macro_rules! mk_workspace {
    ( $text:ident, $( $pattern:ident, $replacement:literal, )* ) => {
        $(
            let $text = $pattern.replace_all(&$text, $replacement);
        )*
    };
}
78
/// Declares a batch of `lazy_static!` regexes in one go.
///
/// Takes `NAME, pattern,` pairs (each pair followed by a comma) and
/// expands to a single `lazy_static!` block with one
/// `static ref NAME: Regex` per pair, initialised from
/// `Regex::new(pattern).unwrap()`.
macro_rules! extra_lazy {
    ( $( $name:ident, $pattern:expr, )* ) => {
        lazy_static! {
            $(
                static ref $name: Regex = Regex::new($pattern).unwrap();
            )*
        }
    };
}
86
// Generate `mk_strip()` and `mk_rewrite()`, which render the
// `TO_STRIP` and `TO_REWRITE` tables as regex source strings.
big_collection!(TO_STRIP, mk_strip);
big_collection!(TO_REWRITE, mk_rewrite);
89
/// Regex source matching a complete `<script>…</script>` or
/// `<style>…</style>` element, body included, so the element's
/// contents are dropped together with its tags.
///
/// The `(?s)` flag lets `.` match newlines; without it an element
/// whose body spans several lines would not match and its contents
/// would end up in the output.  The tag names are matched in lower
/// case only.
const SCRIPT_AND_STYLE: &str = r"(?s)(<script[^>]*?>.*?</script>|<style[^>]*?>.*?</style>)";
91
92/// Sanitize a string and return an array of the atomized words, all
93/// lowercased.  This function is here because there are other uses
94/// for slugified titles than just as slugs, and clients may want to
95/// limit the length of a slug, remove stopwords or just "a|an|the"
96/// language articles, or other modifications.
97pub fn sanitize_and_split(title: &str) -> Vec<String> {
98    #[rustfmt::skip]
99    extra_lazy! {
100        STRIP_DANGEROUS_TAGS, SCRIPT_AND_STYLE,
101        REMOVE_TAGS, r"<[^>]*?>",
102        REMOVE_SOFT_PUNCT, &mk_strip(),
103        REWRITE_SOFT_PUNCT, &mk_rewrite(),
104        REMOVE_REMAINING_ENTITIES, r"&.+?;",
105		REWRITE_ACCEPTABLE_PUNCT, r"[\.\?!;:_@\r\n]+",
106        REMOVE_REMAINING_PUNCT, r"[^%\p{Alphabetic}0-9 -]+",
107    }
108
109    let workspace = title.to_string().to_lowercase();
110
111    #[rustfmt::skip]
112    mk_workspace!(
113		workspace,
114        STRIP_DANGEROUS_TAGS, "",
115        REMOVE_TAGS, "",
116        REMOVE_SOFT_PUNCT, "",
117        REWRITE_SOFT_PUNCT, "-",
118        REMOVE_REMAINING_ENTITIES, "",
119        REWRITE_ACCEPTABLE_PUNCT, "-",
120		REMOVE_REMAINING_PUNCT, "",
121    );
122
123    workspace
124        .split(|c| c == ' ' || c == '-')
125        .filter(|s| !s.is_empty())
126        .map(|s| s.to_string())
127        .collect()
128}
129
130/// Sanitize a string and return the string lowercased with a single
131/// hyphen between the words.
132pub fn slugify(title: &str) -> String {
133    sanitize_and_split(title).join("-")
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// `(title, expected slug)` pairs: plain text, markup, repeated
    /// and leading/trailing separators, punctuation, newlines,
    /// non-ASCII letters, bare ampersands and an e-mail address.
    const SAMPLES: [(&str, &str); 10] = [
        ("This is a test.", "this-is-a-test"),
        ("This is a <script>alert('!')</script> test", "this-is-a-test"),
        ("this is a <em>test</em>", "this-is-a-test"),
        ("        this    is --- a       <em>test</em>        ", "this-is-a-test"),
        ("Excellent!!!1!1", "excellent-1-1"),
        ("make\nit   work?", "make-it-work"),
        // Yes, that's a real band.
        ("Töxic Tësticle Färm?", "töxic-tësticle-färm"),
        ("  ----You--and--_-_me", "you-and-me"),
        ("Boys & Girls & Those Elsewhere", "boys-girls-those-elsewhere"),
        ("user@example.com", "user-example-com"),
    ];

    /// Every sample title must slugify to exactly its expected slug.
    #[test]
    fn basic_checks() {
        for &(title, expected) in SAMPLES.iter() {
            assert_eq!(slugify(title), expected);
        }
    }
}