// slugrs 0.3.1
//
// A fast, locale-aware slugify library for Rust.
// Documentation
use deunicode::deunicode;
use regex::Regex;
use unicode_categories::UnicodeCategories;

use crate::options::Options;

/// Slugifier processes strings into URL-friendly slugs.
///
/// It owns a single [`Options`] value; the builder-style methods on this type
/// overwrite individual option fields, and [`Slugifier::slugify`] passes the
/// stored options to the shared slug-building routine.
#[derive(Debug, Clone)]
pub struct Slugifier {
    // Configuration read on every `slugify` call.
    options: Options,
}

impl Slugifier {
    /// Creates a slugifier configured with `Options::default()`.
    pub fn new() -> Self {
        Self::with_options(Options::default())
    }

    /// Creates a slugifier that uses the supplied `options` as-is.
    pub fn with_options(options: Options) -> Self {
        Self { options }
    }

    /// Returns a shared reference to the current options.
    pub fn options(&self) -> &Options {
        let Self { options } = self;
        options
    }

    /// Returns a mutable reference to the current options for in-place edits.
    pub fn options_mut(&mut self) -> &mut Options {
        let Self { options } = self;
        options
    }

    /// Builder: sets the string written between words of the slug.
    pub fn separator(self, separator: impl Into<String>) -> Self {
        let Self { mut options } = self;
        options.separator = separator.into();
        Self { options }
    }

    /// Builder: sets the locale whose mappings are applied to the input before
    /// the slug is built; `None` disables locale-specific mapping.
    pub fn locale(self, locale: Option<crate::Locale>) -> Self {
        let Self { mut options } = self;
        options.locale = locale;
        Self { options }
    }

    /// Builder: sets a regex whose matches are deleted from the input before
    /// any other processing; `None` disables the pre-removal step.
    pub fn remove(self, remove: Option<Regex>) -> Self {
        let Self { mut options } = self;
        options.remove = remove;
        Self { options }
    }

    /// Builder: chooses whether the finished slug is lowercased.
    pub fn lowercase(self, lowercase: bool) -> Self {
        let Self { mut options } = self;
        options.lowercase = lowercase;
        Self { options }
    }

    /// Builder: chooses whether leading and trailing separators are stripped.
    pub fn trim(self, trim: bool) -> Self {
        let Self { mut options } = self;
        options.trim = trim;
        Self { options }
    }

    /// Builder: sets the byte-length limit enforced while the slug is built;
    /// `None` means no limit.
    pub fn max_length(self, max_length: Option<usize>) -> Self {
        let Self { mut options } = self;
        options.max_length = max_length;
        Self { options }
    }

    /// Converts `input` into a slug using this slugifier's options.
    pub fn slugify(&self, input: &str) -> String {
        slugify_impl(input, self.options())
    }
}

impl Default for Slugifier {
    /// Equivalent to [`Slugifier::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Converts `input` into a slug using the given `options`, without needing a
/// [`Slugifier`] instance.
///
/// This forwards directly to the same internal routine that
/// [`Slugifier::slugify`] uses.
pub fn slugify_with_options(input: &str, options: &Options) -> String {
    slugify_impl(input, options)
}

/// Builds a slug from `input` according to `options`.
///
/// Steps, in order:
/// 1. If `options.remove` is set, every match of that regex is deleted.
/// 2. If `options.locale` is set, its `apply` mapping is run over the text.
/// 3. The text is scanned once, character by character:
///    * with `drop_emoji`, anything that is not a letter, number or whitespace
///      is dropped, except `-` and `_`, which become a separator;
///    * with `drop_apostrophes`, `'` and U+2019 are dropped;
///    * ASCII alphanumerics are copied, other alphanumerics are transliterated
///      with `deunicode`;
///    * everything else becomes a separator, with runs collapsed to one.
///
///    Scanning stops as soon as the next piece would push the output past
///    `options.max_length` bytes.
/// 4. With `trim`, leading and trailing separators are stripped.
/// 5. With `lowercase`, the result is lowercased.
///
/// Returns the finished slug as an owned `String`.
fn slugify_impl(input: &str, options: &Options) -> String {
    // Pre-remove using the custom regex, if one is configured.
    let mut preprocessed = input;
    let owned_removed;
    if let Some(ref remove_regex) = options.remove {
        owned_removed = remove_regex.replace_all(input, "").into_owned();
        preprocessed = &owned_removed;
    }

    // Locale-specific mappings run before generic transliteration.
    let localized = if let Some(locale) = options.locale {
        locale.apply(preprocessed)
    } else {
        preprocessed.to_string()
    };

    let sep = options.separator.as_str();
    let max_length = options.max_length;
    let mut builder = String::with_capacity(localized.len());
    // When trimming is on, a leading separator would be stripped again below,
    // so it is suppressed up front instead of being charged against `max_length`.
    let mut prev_was_sep = options.trim;

    // True when `extra` more bytes fit under the limit (always true without one).
    let fits = |current: usize, extra: usize| {
        max_length.map_or(true, |max_len| current.saturating_add(extra) <= max_len)
    };

    // Writes a separator unless one was just written. Returns `false` only when
    // a separator had to be written but no longer fits; a collapsed separator
    // costs nothing and never ends the scan.
    let push_sep = |out: &mut String, prev_flag: &mut bool| -> bool {
        if *prev_flag {
            return true;
        }
        if !fits(out.len(), sep.len()) {
            return false;
        }
        out.push_str(sep);
        *prev_flag = true;
        true
    };

    for ch in localized.chars() {
        // Drop emoji/symbols; `-` and `_` still act as word boundaries.
        if options.drop_emoji && !(ch.is_letter() || ch.is_number() || ch.is_whitespace()) {
            if (ch == '-' || ch == '_') && !push_sep(&mut builder, &mut prev_was_sep) {
                break;
            }
            continue;
        }

        // Drop straight and typographic (U+2019) apostrophes so that
        // contractions stay a single word.
        if options.drop_apostrophes && (ch == '\'' || ch == '\u{2019}') {
            continue;
        }

        // Any non-alphanumeric character becomes a separator.
        if !ch.is_alphanumeric() {
            if !push_sep(&mut builder, &mut prev_was_sep) {
                break;
            }
            continue;
        }

        if ch.is_ascii() {
            // Fast path: ASCII alphanumerics are copied verbatim.
            if !fits(builder.len(), 1) {
                break;
            }
            builder.push(ch);
        } else {
            // Transliterate a single char through a small stack buffer.
            // NOTE(review): the transliteration is appended verbatim; confirm
            // `deunicode` never yields spaces or punctuation for these inputs.
            let mut buf = [0u8; 4];
            let ascii = deunicode(ch.encode_utf8(&mut buf));
            if ascii.is_empty() {
                continue;
            }
            if !fits(builder.len(), ascii.len()) {
                break;
            }
            builder.push_str(&ascii);
        }
        prev_was_sep = false;
    }

    // Strip leading/trailing separators, handling multi-character separators.
    // An empty separator matches everywhere, so there is nothing to strip.
    let mut text = builder;
    if options.trim && !sep.is_empty() {
        let leading = text.len() - text.trim_start_matches(sep).len();
        text.drain(..leading);
        let kept = text.trim_end_matches(sep).len();
        text.truncate(kept);
    }

    if options.lowercase {
        text = text.to_lowercase();
    }

    text
}