github_slugger/
lib.rs

//! Generate header slugs for GitHub Markdown.
//!
//! Adapted from <https://github.com/Flet/github-slugger>.
3
4use once_cell::sync::Lazy;
5use regex::Regex;
6use std::collections::HashSet;
7
/// Hands out slugs that are unique among those it has produced.
///
/// Every slug returned by [`Slugger::slug`] is recorded here so that a later
/// collision can be detected; [`Slugger::reset`] forgets them all.
#[derive(Debug, Default)]
pub struct Slugger {
    /// Slugs recorded since creation or the last reset.
    slugs: HashSet<String>,
}
13
14// See https://github.com/rust-lang/regex/blob/master/UNICODE.md#rl12-properties
15// and https://www.compart.com/en/unicode/category/So
16static REMOVE_PAT: &str = r"[\p{Other_Number}\p{Close_Punctuation}\p{Final_Punctuation}\p{Initial_Punctuation}\p{Open_Punctuation}\p{Other_Punctuation}\p{Dash_Punctuation}\p{Symbol}\p{Control}\p{Private_Use}\p{Format}\p{Unassigned}\p{Separator}]";
17static REMOVE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(REMOVE_PAT).unwrap());
18
19impl Slugger {
20    /// Generate a slug for the given string.
21    pub fn slug(&mut self, s: &str) -> String {
22        // if we've already seen this slug, add a number to the end
23        let base = slug(s);
24        let mut result = base.clone();
25        let mut i = 1;
26        while self.slugs.contains(&result) {
27            result = format!("{}-{}", base, i);
28            i += 1;
29        }
30
31        self.slugs.insert(result.clone());
32        result
33    }
34
35    /// Clear the set of slugs we've seen so far.
36    pub fn reset(&mut self) {
37        self.slugs.clear();
38    }
39}
40
41pub fn slug(input: &str) -> String {
42    let s = input.to_lowercase();
43
44    // apply function to regex matches
45    let s = REMOVE_RE.replace_all(&s, |caps: &regex::Captures| {
46        let c = caps.get(0).unwrap().as_str();
47        if c == " " || c == "-" {
48            "-".to_string()
49        } else if c.chars().all(|a| a.is_alphabetic()) {
50            // note in "Other Symbols" this matches:
51            // ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ
52            // ⓐⓑⓒⓓⓔⓕⓖⓗⓘⓙⓚⓛⓜⓝⓞⓟⓠⓡⓢⓣⓤⓥⓦⓧⓨⓩ
53            // 🄰🄱🄲🄳🄴🄵🄶🄷🄸🄹🄺🄻🄼🄽🄾🄿🅀🅁🅂🅃🅄🅅🅆🅇🅈🅉
54            // 🅐🅑🅒🅓🅔🅕🅖🅗🅘🅙🅚🅛🅜🅝🅞🅟🅠🅡🅢🅣🅤🅥🅦🅧🅨🅩
55            // 🅰🅱🅲🅳🅴🅵🅶🅷🅸🅹🅺🅻🅼🅽🅾🅿🆀🆁🆂🆃🆄🆅🆆🆇🆈🆉
56            c.to_string()
57        } else {
58            "".to_string()
59        }
60    });
61    s.replace(|c: char| c.is_whitespace(), "-")
62}