// slugrs 0.3.2
//
// A fast, locale-aware slugify library for Rust.
// Documentation
use deunicode::deunicode;
use regex::Regex;
use unicode_categories::UnicodeCategories;

use crate::options::Options;

/// Reusable slug generator that turns strings into URL-friendly slugs.
///
/// A `Slugifier` owns one [`Options`] value. Create it with
/// [`Slugifier::new`] or [`Slugifier::with_options`], adjust it through the
/// chained setter methods, then call [`Slugifier::slugify`] as often as needed.
#[derive(Debug, Clone)]
pub struct Slugifier {
    /// Settings consulted on every `slugify` call.
    options: Options,
}

impl Slugifier {
    /// Creates a slugifier configured with `Options::default()`.
    pub fn new() -> Self {
        Self::with_options(Options::default())
    }

    /// Creates a slugifier that uses exactly the supplied `options`.
    pub fn with_options(options: Options) -> Self {
        Self { options }
    }

    /// Returns a shared reference to the current options.
    pub fn options(&self) -> &Options {
        &self.options
    }

    /// Returns a mutable reference to the current options for in-place edits.
    pub fn options_mut(&mut self) -> &mut Options {
        &mut self.options
    }

    /// Applies `change` to the stored options and hands the slugifier back,
    /// so every chained setter below shares one code path.
    fn update_options(mut self, change: impl FnOnce(&mut Options)) -> Self {
        change(&mut self.options);
        self
    }

    /// Sets the string emitted in place of non-alphanumeric characters.
    pub fn separator(self, separator: impl Into<String>) -> Self {
        self.update_options(|opts| opts.separator = separator.into())
    }

    /// Sets the locale applied to the input before slug building, or clears
    /// it with `None`.
    pub fn locale(self, locale: Option<crate::Locale>) -> Self {
        self.update_options(|opts| opts.locale = locale)
    }

    /// Sets the regex whose matches are deleted from the input before slug
    /// building, or clears it with `None`.
    pub fn remove(self, remove: Option<Regex>) -> Self {
        self.update_options(|opts| opts.remove = remove)
    }

    /// Chooses whether the finished slug is lowercased.
    pub fn lowercase(self, lowercase: bool) -> Self {
        self.update_options(|opts| opts.lowercase = lowercase)
    }

    /// Chooses whether leading and trailing separators are stripped from the
    /// finished slug.
    pub fn trim(self, trim: bool) -> Self {
        self.update_options(|opts| opts.trim = trim)
    }

    /// Sets the byte budget for the slug while it is being built, or removes
    /// the limit with `None`.
    pub fn max_length(self, max_length: Option<usize>) -> Self {
        self.update_options(|opts| opts.max_length = max_length)
    }

    /// Converts `input` into a slug according to the current options.
    pub fn slugify(&self, input: &str) -> String {
        slugify_impl(input, self.options())
    }
}

impl Default for Slugifier {
    fn default() -> Self {
        Self::new()
    }
}

/// Converts `input` into a slug using the supplied `options`.
///
/// Free-function entry point for callers that already hold an [`Options`]
/// value; it forwards directly to the private `slugify_impl`.
pub fn slugify_with_options(input: &str, options: &Options) -> String {
    slugify_impl(input, options)
}

/// Core slug builder shared by every public entry point.
///
/// Steps, in order:
/// 1. Pre-process `input` with `apply_remove_and_locale`.
/// 2. Walk the result character by character: characters reported by
///    `should_drop_char` are skipped, ASCII alphanumerics are copied,
///    non-ASCII alphanumerics are transliterated, and everything else
///    (including `-` and `_`, which `should_drop_char` never reports) becomes
///    a separator.
/// 3. Stop at the first piece that would push the output past
///    `options.max_length`.
/// 4. Optionally trim leading/trailing separators, then optionally lowercase.
fn slugify_impl(input: &str, options: &Options) -> String {
    let pre = apply_remove_and_locale(input, options);
    let sep = options.separator.as_str();

    let mut state = BuildState::with_capacity(pre.len());
    for ch in pre.chars() {
        if should_drop_char(ch, options) {
            continue;
        }

        // Each helper returns `false` when the length budget is exhausted.
        let fits = if !ch.is_alphanumeric() {
            push_separator(&mut state, sep, options)
        } else if ch.is_ascii() {
            append_ascii_char(&mut state, ch, options)
        } else {
            append_transliterated_char(&mut state, ch, options)
        };
        if !fits {
            break;
        }
    }

    let mut text = state.builder;
    // Trimming runs after the length cut, so a separator left dangling by the
    // cut is removed as well.
    if options.trim {
        trim_separators(&mut text, sep);
    }

    if options.lowercase {
        text = text.to_lowercase();
    }

    text
}

/// Mutable accumulator threaded through the slug-building helpers.
struct BuildState {
    /// Output text assembled so far.
    builder: String,
    /// `true` when the most recent piece written was a separator; used to
    /// collapse runs of separators into one.
    prev_was_sep: bool,
    /// Running count of bytes written, compared against the length limit.
    bytes_so_far: usize,
}

impl BuildState {
    /// Creates an empty state whose output buffer is pre-allocated for `cap`
    /// bytes.
    fn with_capacity(cap: usize) -> Self {
        let builder = String::with_capacity(cap);
        BuildState {
            builder,
            prev_was_sep: false,
            bytes_so_far: 0,
        }
    }
}

/// Pre-processes `input` before slug building.
///
/// First deletes every match of `options.remove` (when set), then passes the
/// result through `options.locale` (when set). With neither option set the
/// input is returned as an owned copy.
fn apply_remove_and_locale(input: &str, options: &Options) -> String {
    let stripped = match options.remove {
        Some(ref pattern) => pattern.replace_all(input, "").into_owned(),
        None => input.to_string(),
    };

    match options.locale {
        Some(locale) => locale.apply(&stripped),
        None => stripped,
    }
}

/// Returns `true` for any character that is neither a letter, a number, nor
/// whitespace.
///
/// `is_letter` and `is_number` are not inherent `char` methods; they come from
/// the imported `UnicodeCategories` trait.
fn is_symbol_like(c: char) -> bool {
    !c.is_letter() && !c.is_number() && !c.is_whitespace()
}

/// Reports whether `c` must be discarded without leaving a separator behind.
///
/// A character is dropped when either:
/// * `options.drop_apostrophes` is set and `c` is an apostrophe, or
/// * `options.drop_emoji` is set and `c` is symbol-like (see
///   `is_symbol_like`), except for `-` and `_`, which are never dropped here.
fn should_drop_char(c: char, options: &Options) -> bool {
    // NOTE(review): the second literal was empty (`''`), which is not a valid
    // char literal. U+2019 (right single quotation mark, the typographic
    // apostrophe) is assumed to be the character that was lost — confirm
    // against the upstream source.
    let is_apostrophe = matches!(c, '\'' | '\u{2019}');

    (options.drop_apostrophes && is_apostrophe)
        || (options.drop_emoji && is_symbol_like(c) && c != '-' && c != '_')
}

/// Returns `true` when appending `add_len` bytes to an output that already
/// holds `current` bytes would go past `options.max_length`.
///
/// Always `false` when no maximum length is configured.
fn would_exceed(current: usize, add_len: usize, options: &Options) -> bool {
    match options.max_length {
        Some(limit) => current + add_len > limit,
        None => false,
    }
}

/// Appends `sep` to the output unless the previous piece was already a
/// separator.
///
/// Returns `false` only when the separator does not fit within the length
/// limit; a skipped duplicate separator still counts as success.
fn push_separator(state: &mut BuildState, sep: &str, options: &Options) -> bool {
    if !state.prev_was_sep {
        let sep_len = sep.len();
        if would_exceed(state.bytes_so_far, sep_len, options) {
            return false;
        }
        state.builder.push_str(sep);
        state.bytes_so_far += sep_len;
        state.prev_was_sep = true;
    }
    true
}

/// Appends a single character and counts it as one byte.
///
/// The caller in `slugify_impl` only passes ASCII characters, which is what
/// makes the one-byte accounting correct. Returns `false`, leaving `state`
/// untouched, when the character does not fit within the length limit.
fn append_ascii_char(state: &mut BuildState, ch: char, options: &Options) -> bool {
    let fits = !would_exceed(state.bytes_so_far, 1, options);
    if fits {
        state.builder.push(ch);
        state.bytes_so_far += 1;
        state.prev_was_sep = false;
    }
    fits
}

/// Appends the `deunicode` transliteration of `ch` to the output.
///
/// Returns `true` when the transliteration was written or was empty (in which
/// case `state` is left untouched), and `false` when the whole transliteration
/// does not fit within the length limit. A transliteration is never written
/// partially.
fn append_transliterated_char(state: &mut BuildState, ch: char, options: &Options) -> bool {
    // `deunicode` works on string slices, so encode the char into a stack buffer.
    let mut utf8 = [0u8; 4];
    let replacement = deunicode(ch.encode_utf8(&mut utf8));
    if replacement.is_empty() {
        // Nothing to write; this is not a length failure.
        return true;
    }

    let extra = replacement.len();
    if would_exceed(state.bytes_so_far, extra, options) {
        return false;
    }

    state.builder.push_str(&replacement);
    state.bytes_so_far += extra;
    state.prev_was_sep = false;
    true
}

/// Strips every leading and then every trailing occurrence of `sep` from
/// `text`, in place.
///
/// An empty `sep` leaves `text` unchanged. Without that guard the function
/// would never return, because every string starts with the empty string and
/// removing it makes no progress.
fn trim_separators(text: &mut String, sep: &str) {
    if sep.is_empty() {
        return;
    }

    // Measure the whole leading run first so the buffer is shifted only once.
    let leading = text.len() - text.trim_start_matches(sep).len();
    text.drain(..leading);

    // The trailing run is measured on the already front-trimmed text, keeping
    // the front-then-back order that matters for self-overlapping separators.
    let kept = text.trim_end_matches(sep).len();
    text.truncate(kept);
}