use std::collections::{HashMap, HashSet};
use pyo3::prelude::*;
use crate::limits::MAX_UNIQUE_ATTEMPTS;
use crate::slugify::{slugify_impl, slugify_impl_with_stopset, SlugConfig};
use crate::utils::floor_char_boundary;
#[pyfunction]
#[pyo3(signature = (
text,
*,
separator="-",
lowercase=true,
max_length=0,
word_boundary=false,
save_order=false,
stopwords=vec![],
regex_pattern=None,
replacements=vec![],
allow_unicode=false,
lang=None,
entities=true,
decimal=true,
hexadecimal=true,
))]
pub fn _slugify(
text: &str,
separator: &str,
lowercase: bool,
max_length: i64,
word_boundary: bool,
save_order: bool,
stopwords: Vec<String>,
regex_pattern: Option<&str>,
replacements: Vec<(String, String)>,
allow_unicode: bool,
lang: Option<&str>,
entities: bool,
decimal: bool,
hexadecimal: bool,
) -> PyResult<String> {
crate::transliterate::validate_lang(lang)?;
let max_length = crate::error::checked_max_length(max_length)?;
let config = SlugConfig::from_pyargs(
separator,
lowercase,
max_length,
word_boundary,
save_order,
stopwords,
regex_pattern,
replacements,
allow_unicode,
lang,
entities,
decimal,
hexadecimal,
)
.map_err(pyo3::PyErr::from)?;
Ok(slugify_impl(text, &config))
}
#[pyfunction]
#[pyo3(signature = (
texts,
*,
separator="-",
lowercase=true,
max_length=0,
word_boundary=false,
save_order=false,
stopwords=vec![],
regex_pattern=None,
replacements=vec![],
allow_unicode=false,
lang=None,
entities=true,
decimal=true,
hexadecimal=true,
))]
pub fn _slugify_batch(
py: Python<'_>,
texts: &Bound<'_, pyo3::types::PyList>,
separator: &str,
lowercase: bool,
max_length: i64,
word_boundary: bool,
save_order: bool,
stopwords: Vec<String>,
regex_pattern: Option<&str>,
replacements: Vec<(String, String)>,
allow_unicode: bool,
lang: Option<&str>,
entities: bool,
decimal: bool,
hexadecimal: bool,
) -> PyResult<Vec<String>> {
let texts = texts.to_tuple();
let len = texts.len();
if len > crate::MAX_BATCH_SIZE {
return Err(crate::ErrorRepr::BatchTooLarge {
len,
max: crate::MAX_BATCH_SIZE,
}
.into());
}
crate::transliterate::validate_lang(lang)?;
let max_length = crate::error::checked_max_length(max_length)?;
let config = SlugConfig::from_pyargs(
separator,
lowercase,
max_length,
word_boundary,
save_order,
stopwords,
regex_pattern,
replacements,
allow_unicode,
lang,
entities,
decimal,
hexadecimal,
)
.map_err(pyo3::PyErr::from)?;
let stopset: HashSet<String> = config.stopwords.iter().cloned().collect();
let mut out: Vec<String> = Vec::with_capacity(len);
let mut start = 0;
while start < len {
let end = (start + crate::BATCH_CHUNK_SIZE).min(len);
let mut chunk: Vec<String> = Vec::with_capacity(end - start);
for i in start..end {
chunk.push(texts.get_item(i)?.extract::<String>()?);
}
let processed: Vec<String> = py.detach(|| {
chunk
.iter()
.map(|text| slugify_impl_with_stopset(text, &config, Some(&stopset)))
.collect()
});
out.extend(processed);
start = end;
}
Ok(out)
}
/// Reusable slugifier exposed to Python as `_Slugifier`.
///
/// Holds a configuration that is built once in `new` and reused by every
/// `slugify` call.
#[pyclass]
#[pyo3(name = "_Slugifier")]
pub struct _Slugifier {
/// Slug configuration produced by `SlugConfig::from_pyargs` in `new`.
config: SlugConfig,
/// Set collected from `config.stopwords` in `new`; passed to
/// `slugify_impl_with_stopset` on each call.
stopset: HashSet<String>,
}
#[pymethods]
impl _Slugifier {
#[new]
#[pyo3(signature = (
*,
separator="-",
lowercase=true,
max_length=0,
word_boundary=false,
save_order=false,
stopwords=vec![],
regex_pattern=None,
replacements=vec![],
allow_unicode=false,
lang=None,
entities=true,
decimal=true,
hexadecimal=true,
safe_chars="",
))]
fn new(
separator: &str,
lowercase: bool,
max_length: i64,
word_boundary: bool,
save_order: bool,
stopwords: Vec<String>,
regex_pattern: Option<&str>,
replacements: Vec<(String, String)>,
allow_unicode: bool,
lang: Option<&str>,
entities: bool,
decimal: bool,
hexadecimal: bool,
safe_chars: &str,
) -> PyResult<Self> {
crate::transliterate::validate_lang(lang)?;
let max_length = crate::error::checked_max_length(max_length)?;
let mut config = SlugConfig::from_pyargs(
separator,
lowercase,
max_length,
word_boundary,
save_order,
stopwords,
regex_pattern,
replacements,
allow_unicode,
lang,
entities,
decimal,
hexadecimal,
)
.map_err(pyo3::PyErr::from)?;
safe_chars.clone_into(&mut config.safe_chars);
let stopset: HashSet<String> = config.stopwords.iter().cloned().collect();
Ok(Self { config, stopset })
}
fn slugify(&self, text: &str) -> String {
slugify_impl_with_stopset(text, &self.config, Some(&self.stopset))
}
#[getter]
fn separator(&self) -> &str {
&self.config.separator
}
#[getter]
fn lang(&self) -> Option<&str> {
self.config.lang.as_deref()
}
}
/// Slugifier exposed to Python as `_UniqueSlugifier` that never returns the
/// same slug twice from one instance (until `reset` is called).
#[pyclass]
#[pyo3(name = "_UniqueSlugifier")]
pub struct _UniqueSlugifier {
/// Wrapped slugifier that produces the base slug and owns the configuration.
inner: _Slugifier,
/// Every slug this instance has returned so far; cleared by `reset`.
seen: HashSet<String>,
/// Optional Python callable invoked with a candidate slug; a `true` result
/// marks the candidate as taken.
check: Option<Py<PyAny>>,
/// For each base slug, the next counter to try. Only read and written when
/// `check` is `None`; cleared by `reset`.
next_counter: HashMap<String, u64>,
}
/// Build the candidate slug for `base` at a given `counter`.
///
/// * `counter == 0` yields `base` unchanged.
/// * Otherwise the candidate is `base` + separator + counter. When
///   `config.max_length` is non-zero and the candidate is longer than that,
///   `base` is shortened (on a char boundary) so the suffix still fits.
/// * If the suffix alone is at least `max_length` bytes long, the candidate is
///   the suffix cut to `max_length`.
///
/// The returned flag is `true` only when part of the suffix itself was cut
/// off, i.e. the counter is no longer fully represented in the candidate.
fn build_unique_candidate(base: &str, counter: u64, config: &SlugConfig) -> (String, bool) {
    if counter == 0 {
        return (base.to_owned(), false);
    }

    let suffix = format!("{}{}", config.separator, counter);
    let limit = config.max_length;

    // A limit of zero means no truncation is applied here.
    if limit == 0 || base.len() + suffix.len() <= limit {
        return (format!("{base}{suffix}"), false);
    }

    if suffix.len() >= limit {
        // No room for any of `base`; keep as much of the suffix as fits.
        let cut = floor_char_boundary(&suffix, limit);
        return (suffix[..cut].to_owned(), cut < suffix.len());
    }

    // Shorten `base` to whatever is left after reserving the full suffix.
    let cut = floor_char_boundary(base, limit - suffix.len());
    (format!("{}{suffix}", &base[..cut]), false)
}
#[pymethods]
impl _UniqueSlugifier {
    /// Build a unique slugifier.
    ///
    /// `check` is an optional Python callable; all other options are passed to
    /// `_Slugifier::new`. The instance starts with no remembered slugs.
    ///
    /// # Errors
    ///
    /// Returns whatever error `_Slugifier::new` reports.
    #[new]
    #[pyo3(signature = (
        *,
        check=None,
        separator="-",
        lowercase=true,
        max_length=0,
        word_boundary=false,
        save_order=false,
        stopwords=vec![],
        regex_pattern=None,
        replacements=vec![],
        allow_unicode=false,
        lang=None,
        entities=true,
        decimal=true,
        hexadecimal=true,
        safe_chars="",
    ))]
    fn new(
        check: Option<Py<PyAny>>,
        separator: &str,
        lowercase: bool,
        max_length: i64,
        word_boundary: bool,
        save_order: bool,
        stopwords: Vec<String>,
        regex_pattern: Option<&str>,
        replacements: Vec<(String, String)>,
        allow_unicode: bool,
        lang: Option<&str>,
        entities: bool,
        decimal: bool,
        hexadecimal: bool,
        safe_chars: &str,
    ) -> PyResult<Self> {
        _Slugifier::new(
            separator,
            lowercase,
            max_length,
            word_boundary,
            save_order,
            stopwords,
            regex_pattern,
            replacements,
            allow_unicode,
            lang,
            entities,
            decimal,
            hexadecimal,
            safe_chars,
        )
        .map(|inner| Self {
            inner,
            seen: HashSet::new(),
            check,
            next_counter: HashMap::new(),
        })
    }

    /// Slugify `text` and return a slug this instance has not returned before.
    ///
    /// Candidates are tried with counters 0, 1, 2, … (counter 0 is the plain
    /// slug). A candidate is accepted when it is not in `seen` and, if a
    /// `check` callable was supplied, that callable returns `false` for it.
    /// Without a `check`, the search starts at the counter remembered for this
    /// base slug instead of 0.
    ///
    /// # Errors
    ///
    /// * `ErrorRepr::UniqueSlugMaxLengthTooSmall` when a counter suffix is
    ///   needed but `max_length` is non-zero and smaller than the separator
    ///   length plus one, or when the counter limit is passed after a suffix
    ///   had to be cut.
    /// * `ErrorRepr::UniqueSlugAttemptsExceeded` when the counter exceeds
    ///   `MAX_UNIQUE_ATTEMPTS` without any suffix having been cut.
    /// * Any exception raised by `check`, or by extracting its result as a
    ///   `bool`.
    fn slugify(&mut self, py: Python<'_>, text: &str) -> PyResult<String> {
        let base = self.inner.slugify(text);
        let config = &self.inner.config;

        // The per-base hint is only used when there is no external check.
        let use_hint = self.check.is_none();
        let mut counter: u64 = match self.check {
            None => self.next_counter.get(&base).copied().unwrap_or(0),
            Some(_) => 0,
        };

        // Smallest length that can hold a separator plus one counter digit.
        let min_unique_len = config.separator.len() + 1;
        let suffix_cannot_fit = config.max_length > 0 && config.max_length < min_unique_len;
        let mut saw_lossy = false;

        loop {
            if counter > MAX_UNIQUE_ATTEMPTS {
                // If a suffix was ever cut, the length limit is the likely
                // culprit, so report that instead of a plain exhaustion.
                let failure = if saw_lossy {
                    tl_warn!(
                        "unique_slug_max_length_too_small: max_length={} sep_len={}",
                        config.max_length,
                        config.separator.len()
                    );
                    crate::ErrorRepr::UniqueSlugMaxLengthTooSmall {
                        max_length: config.max_length,
                        separator: config.separator.clone(),
                        min_unique_len: config.separator.len() + 1,
                    }
                } else {
                    tl_warn!("unique_slug_attempts_exceeded: max={MAX_UNIQUE_ATTEMPTS}");
                    crate::ErrorRepr::UniqueSlugAttemptsExceeded {
                        max: MAX_UNIQUE_ATTEMPTS,
                        text: text.to_owned(),
                    }
                };
                return Err(failure.into());
            }

            // Only counters >= 1 need a suffix, so counter 0 is still allowed
            // even when the limit is too small for one.
            if counter >= 1 && suffix_cannot_fit {
                tl_warn!(
                    "unique_slug_max_length_too_small: max_length={} min_unique_len={min_unique_len}",
                    config.max_length
                );
                return Err(crate::ErrorRepr::UniqueSlugMaxLengthTooSmall {
                    max_length: config.max_length,
                    separator: config.separator.clone(),
                    min_unique_len,
                }
                .into());
            }

            let (candidate, lossy) = build_unique_candidate(&base, counter, config);
            saw_lossy |= lossy;

            // Short-circuit: the Python callable is never invoked for a
            // candidate this instance has already handed out.
            let is_free = !self.seen.contains(&candidate)
                && match &self.check {
                    Some(check_fn) => !check_fn.call1(py, (&candidate,))?.extract::<bool>(py)?,
                    None => true,
                };

            if is_free {
                self.seen.insert(candidate.clone());
                if use_hint {
                    self.next_counter.insert(base, counter + 1);
                }
                return Ok(candidate);
            }
            counter += 1;
        }
    }

    /// Forget every slug returned so far and all per-base counter hints.
    fn reset(&mut self) {
        self.next_counter.clear();
        self.seen.clear();
    }
}