use rust_stemmers::{Algorithm, Stemmer};
use std::sync::OnceLock;
use unicode_normalization::UnicodeNormalization;
/// Canonical form of a piece of text together with one stem per token.
///
/// Values are produced by [`canonicalize`]. Equality compares both fields, and
/// `Default` yields an empty canonical string with no stems.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct CanonicalText {
    /// Normalized, case-folded text in which tokens are separated by single spaces.
    pub canon: String,
    /// One entry per whitespace-separated token of `canon`, in the same order.
    pub stems: Vec<String>,
}
/// Shared English stemmer, created the first time [`stemmer`] is called.
static STEMMER: OnceLock<Stemmer> = OnceLock::new();

/// Returns the shared English [`Stemmer`], building it on first use.
fn stemmer() -> &'static Stemmer {
    let build_english = || Stemmer::create(Algorithm::English);
    STEMMER.get_or_init(build_english)
}
/// Canonicalizes `text` and computes a stem for each resulting token.
///
/// The steps run in this order:
///
/// 1. NFKC-normalize the input, then map typographic quotes, hyphens/dashes
///    and a few space-like characters to their ASCII counterparts.
/// 2. Apply default Unicode case folding.
/// 3. Keep only alphanumeric characters plus `_`, `-`, `'` and `` ` ``.
///    Each run of whitespace becomes a single space; every other character is
///    dropped without leaving a separator behind.
/// 4. Stem each whitespace-separated token with the shared English stemmer.
///    Tokens containing `_` or an ASCII digit are copied verbatim.
///
/// The returned `canon` has no leading or trailing space, and `stems` has
/// exactly one entry per token of `canon`.
pub fn canonicalize(text: &str) -> CanonicalText {
    let normalized: String = text
        .nfkc()
        .map(|c| match c {
            '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
            // Double quotes are mapped to ASCII here but are not in the keep-set
            // below, so they end up removed from `canon`.
            '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
            // U+2010 HYPHEN through U+2015 HORIZONTAL BAR, plus U+2212 MINUS SIGN.
            // Without U+2010 in this set a Unicode hyphen would be deleted by the
            // filter below and glue its neighbours together ("well‐known" ->
            // "wellknown"), unlike the ASCII hyphen, which is kept.
            '\u{2010}'..='\u{2015}' | '\u{2212}' => '-',
            // U+200B ZERO WIDTH SPACE is not `char::is_whitespace`, so it has to
            // be turned into a real space here to act as a token separator.
            '\u{00A0}' | '\u{2000}'..='\u{200B}' => ' ',
            _ => c,
        })
        .collect();
    let folded: String = caseless::default_case_fold_str(&normalized);

    let mut canon = String::with_capacity(folded.len());
    // Starts as `true` so that leading whitespace never emits a space.
    let mut prev_space = true;
    for c in folded.chars() {
        if c.is_alphanumeric() || matches!(c, '_' | '-' | '\'' | '`') {
            canon.push(c);
            prev_space = false;
        } else if c.is_whitespace() && !prev_space {
            canon.push(' ');
            prev_space = true;
        }
    }
    // The loop can leave at most one trailing space and never a leading one,
    // so removing it in place is equivalent to trimming without reallocating.
    if canon.ends_with(' ') {
        canon.pop();
    }

    let stem = stemmer();
    let stems: Vec<String> = canon
        .split_whitespace()
        .map(|tok| {
            // Identifier-like tokens (underscore or digit) are left untouched.
            if tok.contains('_') || tok.chars().any(|c| c.is_ascii_digit()) {
                tok.to_string()
            } else {
                stem.stem(tok).into_owned()
            }
        })
        .collect();

    CanonicalText { canon, stems }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Smart quotes are removed and an em dash becomes an ASCII hyphen.
    #[test]
    fn nfkc_smart_quotes_and_dashes() {
        let c = canonicalize("Use \u{201C}snake_case\u{201D} \u{2014} not camelCase.");
        assert!(c.canon.contains("snake_case"));
        assert_eq!(c.canon, "use snake_case - not camelcase");
    }

    /// Tokens containing an underscore bypass the stemmer.
    #[test]
    fn snake_case_identifier_survives_stemming() {
        let c = canonicalize("Always use snake_case for variables.");
        assert!(c.stems.contains(&"snake_case".to_string()));
    }

    /// Mixed-case identifiers are case-folded, and every token gets a stem.
    #[test]
    fn camelcase_lowercased_and_stemmed() {
        let c = canonicalize("Always use camelCase for variables.");
        assert!(c.canon.contains("camelcase"));
        assert_eq!(c.stems.len(), c.canon.split_whitespace().count());
    }

    /// Runs of mixed whitespace collapse to exactly one space between tokens.
    #[test]
    fn whitespace_collapsed() {
        let c = canonicalize("Use  \t  pnpm,   not npm.");
        // Single spaces between tokens are expected; doubled ones are not.
        assert!(!c.canon.contains("  "));
        assert!(!c.canon.contains('\t'));
        assert_eq!(c.canon, "use pnpm not npm");
    }

    /// The same input always yields the same canonical text and stems.
    #[test]
    fn deterministic() {
        let a = canonicalize("Use camelCase for variables.");
        let b = canonicalize("Use camelCase for variables.");
        assert_eq!(a.canon, b.canon);
        assert_eq!(a.stems, b.stems);
    }
}