slug_intl/lib.rs
1#![doc = include_str!("../README.md")]
2
3use finl_unicode::categories::{CharacterCategories, MajorCategory};
4use unicode_normalization::UnicodeNormalization;
5
6/// Converts `str` to a `String` suitable for use as a URL path component.
7///
8/// It first normalizes Unicode as NFC, then makes some aesthetic conversions:
9/// * letters are lowercased where applicable
10/// * whitespace and punctuation are replaced with hyphen
11/// * repeated hyphens are collapsed
12/// * leading and trailing hyphens are removed
13///
14/// Unlike other similar libraries, this maintains non-ASCII Unicode characters, which are well-
15/// supported by current browsers using percent-encoding. Percent-encoding is left as an exercise
16/// for the caller (because you likely want to store the raw UTF-8 in your database, say).
17///
18/// ## Examples
19///
20/// ASCII-only input deals mainly with capitalization, punctuation, and whitespace:
21/// ```rust
22/// # use slug_intl::slugify;
23/// assert_eq!("hello", slugify("Hello"));
24/// assert_eq!("hello-world", slugify("Hello World"));
25/// assert_eq!("hello-world", slugify("Hello World!"));
26/// assert_eq!("hello-world", slugify("/?&#Hello\n\r\n\r --World!!!"));
27/// ```
28///
29/// Printable Unicode characters are normalized but otherwise preserved:
30/// ```rust
31/// # use slug_intl::slugify;
32/// assert_eq!("おはよう-世界", slugify("おはよう、世界!!"));
33/// assert_eq!("おはよう🐠", slugify("おはよう🐠"));
34/// /// Hyphen replacement is based on what Unicode considers "punctuation":
35/// assert_eq!("1≈3∞5=-£9", slugify("¡¡1≈3∞5=¶£9!!"));
36/// /// Unicode is normalized as NFC
37/// assert_eq!("am\u{00e9}lie", slugify("ame\u{0301}lie"));
38/// ```
39///
40/// You may want to percent-escape the output when rendering HTML, e.g. with the
41/// [urlencoding](https://crates.io/crates/urlencoding) crate:
42/// ```rust
43/// # use slug_intl::slugify;
44/// assert_eq!("hello-%F0%9F%90%A0", urlencoding::encode(&slugify("Hello 🐠")));
45/// ```
46///
47pub fn slugify(str: &str) -> String {
48 let mut prev_hyphen = true; // removes leading hyphens by starting true
49
50 let mut process_char = |c: char| match c.get_major_category() {
51 MajorCategory::L => {
52 prev_hyphen = false;
53 c.to_lowercase().to_string()
54 }
55 MajorCategory::M | MajorCategory::N | MajorCategory::S => {
56 prev_hyphen = false;
57 c.to_string()
58 }
59 MajorCategory::P | MajorCategory::Z | MajorCategory::C => {
60 if prev_hyphen {
61 "".to_string()
62 } else {
63 prev_hyphen = true;
64 "-".to_string()
65 }
66 }
67 };
68
69 // TODO: can we make this more efficient with less copying?
70 str.nfc()
71 .flat_map(|c| process_char(c).chars().collect::<Vec<_>>())
72 .collect::<String>()
73 .trim_end_matches("-")
74 .to_string()
75}