slug_intl/
lib.rs

1#![doc = include_str!("../README.md")]
2
3use finl_unicode::categories::{CharacterCategories, MajorCategory};
4use unicode_normalization::UnicodeNormalization;
5
6/// Converts `str` to a `String` suitable for use as a URL path component.
7///
8/// It first normalizes Unicode as NFC, then makes some aesthetic conversions:
9/// * letters are lowercased where applicable
10/// * whitespace and punctuation are replaced with hyphen
11/// * repeated hyphens are collapsed
12/// * leading and trailing hyphens are removed
13///
14/// Unlike other similar libraries, this maintains non-ASCII Unicode characters, which are well-
15/// supported by current browsers using percent-encoding. Percent-encoding is left as an exercise
16/// for the caller (because you likely want to store the raw UTF-8 in your database, say).
17///
18/// ## Examples
19///
20/// ASCII-only input deals mainly with capitalization, punctuation, and whitespace:
21/// ```rust
22/// # use slug_intl::slugify;
23/// assert_eq!("hello", slugify("Hello"));
24/// assert_eq!("hello-world", slugify("Hello World"));
25/// assert_eq!("hello-world", slugify("Hello World!"));
26/// assert_eq!("hello-world", slugify("/?&#Hello\n\r\n\r   --World!!!"));
27/// ```
28///
29/// Printable Unicode characters are normalized but otherwise preserved:
30/// ```rust
31/// # use slug_intl::slugify;
32/// assert_eq!("おはよう-世界", slugify("おはよう、世界!!"));
33/// assert_eq!("おはよう🐠", slugify("おはよう🐠"));
34/// /// Hyphen replacement is based on what Unicode considers "punctuation":
35/// assert_eq!("1≈3∞5=-£9", slugify("¡¡1≈3∞5=¶£9!!"));
36/// /// Unicode is normalized as NFC
37/// assert_eq!("am\u{00e9}lie", slugify("ame\u{0301}lie"));
38/// ```
39///
40/// You may want to percent-escape the output when rendering HTML, e.g. with the
41/// [urlencoding](https://crates.io/crates/urlencoding) crate:
42/// ```rust
43/// # use slug_intl::slugify;
44/// assert_eq!("hello-%F0%9F%90%A0", urlencoding::encode(&slugify("Hello 🐠")));
45/// ```
46///
47pub fn slugify(str: &str) -> String {
48    let mut prev_hyphen = true; // removes leading hyphens by starting true
49
50    let mut process_char = |c: char| match c.get_major_category() {
51        MajorCategory::L => {
52            prev_hyphen = false;
53            c.to_lowercase().to_string()
54        }
55        MajorCategory::M | MajorCategory::N | MajorCategory::S => {
56            prev_hyphen = false;
57            c.to_string()
58        }
59        MajorCategory::P | MajorCategory::Z | MajorCategory::C => {
60            if prev_hyphen {
61                "".to_string()
62            } else {
63                prev_hyphen = true;
64                "-".to_string()
65            }
66        }
67    };
68
69    // TODO: can we make this more efficient with less copying?
70    str.nfc()
71        .flat_map(|c| process_char(c).chars().collect::<Vec<_>>())
72        .collect::<String>()
73        .trim_end_matches("-")
74        .to_string()
75}