use alloc::string::String;
use unicode_normalization::UnicodeNormalization;
mod bidi;
mod casefold;
/// Decides whether `c` is removed by the stripping stage.
///
/// A character is dropped when bidi stripping is requested and
/// `bidi::is_bidi_control` accepts it, or when format stripping is requested
/// and `bidi::is_format` accepts it. A predicate is only evaluated when its
/// flag is set.
#[inline]
fn should_drop(c: char, drop_bidi: bool, drop_fmt: bool) -> bool {
    if drop_bidi && bidi::is_bidi_control(c) {
        return true;
    }
    drop_fmt && bidi::is_format(c)
}
// Confusable-skeleton support (`confusable::skeleton`); this module is only
// compiled when the `security` feature is enabled.
#[cfg(feature = "security")]
#[cfg_attr(docsrs, doc(cfg(feature = "security")))]
mod confusable;
/// Unicode normalization step selected for [`Canonicalizer::canonicalize`].
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum Normalization {
/// Normalize with `nfc()` from `unicode_normalization`.
Nfc,
/// Normalize with `nfkc()` from `unicode_normalization`.
Nfkc,
/// Skip the normalization step entirely.
None,
}
/// Case-folding step selected for [`Canonicalizer::canonicalize`].
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum CaseFold {
/// Skip case folding.
None,
/// Fold with `casefold::simple`, then re-run the configured normalization.
Simple,
}
/// Configuration for a [`Canonicalizer`]; all fields are public so it can be
/// built with struct-literal syntax and finished with `build()`.
#[derive(Clone, Debug)]
pub struct CanonicalizerBuilder {
/// Normalization applied after stripping (and again after case folding).
pub normalization: Normalization,
/// Case-folding mode applied after the first normalization pass.
pub case_fold: CaseFold,
/// When `true`, characters accepted by `bidi::is_bidi_control` are removed.
pub strip_bidi: bool,
/// When `true`, characters accepted by `bidi::is_format` are removed.
pub strip_format: bool,
/// When `true` and the `security` feature is compiled in, the result is
/// passed through `confusable::skeleton`; without that feature the flag is
/// read but has no effect on the output.
pub apply_confusable: bool,
}
impl Default for CanonicalizerBuilder {
    /// Returns the default configuration: NFKC normalization, simple case
    /// folding, bidi-control and format-character stripping enabled, and the
    /// confusable skeleton disabled.
    fn default() -> Self {
        let normalization = Normalization::Nfkc;
        let case_fold = CaseFold::Simple;
        Self {
            normalization,
            case_fold,
            strip_bidi: true,
            strip_format: true,
            apply_confusable: false,
        }
    }
}
impl CanonicalizerBuilder {
    /// Consumes the builder and returns a [`Canonicalizer`] that owns it as
    /// its configuration.
    #[inline]
    #[must_use]
    pub fn build(self) -> Canonicalizer {
        let cfg = self;
        Canonicalizer { cfg }
    }
}
/// Applies the canonicalization pipeline described by a
/// [`CanonicalizerBuilder`] to input strings.
#[derive(Clone, Debug)]
pub struct Canonicalizer {
// Configuration supplied by the builder.
cfg: CanonicalizerBuilder,
}
impl Default for Canonicalizer {
    /// Builds a canonicalizer from [`CanonicalizerBuilder::default`].
    #[inline]
    fn default() -> Self {
        let builder = CanonicalizerBuilder::default();
        builder.build()
    }
}
impl Canonicalizer {
#[inline]
#[must_use]
pub fn new(builder: CanonicalizerBuilder) -> Self {
builder.build()
}
#[inline]
#[must_use]
pub fn config(&self) -> &CanonicalizerBuilder {
&self.cfg
}
#[must_use]
pub fn canonicalize(&self, input: &str) -> String {
if self.is_default_pipeline() {
if input.is_ascii() {
return input.to_ascii_lowercase();
}
if input
.chars()
.all(|c| c.is_ascii() || bidi::is_bidi_control(c) || bidi::is_format(c))
{
let mut out = String::with_capacity(input.len());
for c in input.chars() {
if c.is_ascii() {
out.push(c.to_ascii_lowercase());
}
}
return out;
}
}
let drop_bidi = self.cfg.strip_bidi;
let drop_fmt = self.cfg.strip_format;
let cap = input.len() + (input.len() >> 4);
let stripped = input
.chars()
.filter(|&c| !should_drop(c, drop_bidi, drop_fmt));
let mut buf = String::with_capacity(cap);
match self.cfg.normalization {
Normalization::Nfkc => buf.extend(stripped.nfkc()),
Normalization::Nfc => buf.extend(stripped.nfc()),
Normalization::None => buf.extend(stripped),
}
if matches!(self.cfg.case_fold, CaseFold::Simple) {
buf = casefold::simple(&buf);
buf = match self.cfg.normalization {
Normalization::Nfkc => buf.nfkc().collect(),
Normalization::Nfc => buf.nfc().collect(),
Normalization::None => buf,
};
}
#[cfg(feature = "security")]
{
if self.cfg.apply_confusable {
buf = confusable::skeleton(&buf);
}
}
#[cfg(not(feature = "security"))]
{
let _ = self.cfg.apply_confusable;
}
buf
}
#[inline]
fn is_default_pipeline(&self) -> bool {
matches!(self.cfg.normalization, Normalization::Nfkc)
&& matches!(self.cfg.case_fold, CaseFold::Simple)
&& self.cfg.strip_bidi
&& self.cfg.strip_format
&& !self.cfg.apply_confusable
}
#[must_use]
pub fn config_string(&self) -> String {
let mut s = String::with_capacity(32);
s.push_str(match self.cfg.normalization {
Normalization::Nfc => "nfc",
Normalization::Nfkc => "nfkc",
Normalization::None => "none",
});
s.push('-');
s.push_str(match self.cfg.case_fold {
CaseFold::Simple => "cf-simple",
CaseFold::None => "cf-none",
});
if self.cfg.strip_bidi {
s.push_str("-bidi");
}
if self.cfg.strip_format {
s.push_str("-fmt");
}
if self.cfg.apply_confusable {
s.push_str("-conf");
}
s
}
}
/// Canonicalizes `input` with a default-configured [`Canonicalizer`].
#[inline]
#[must_use]
pub fn canonicalize(input: &str) -> String {
    let canonicalizer = Canonicalizer::default();
    canonicalizer.canonicalize(input)
}
#[cfg(test)]
mod tests {
use super::*;
use alloc::string::ToString;
#[test]
fn default_lowercases_and_strips_zwsp() {
let c = Canonicalizer::default();
assert_eq!(c.canonicalize("Hello\u{200B}World"), "helloworld");
}
#[test]
fn nfkc_collapses_full_width() {
let c = Canonicalizer::default();
assert_eq!(c.canonicalize("ABC"), "abc");
}
#[test]
fn nfkc_collapses_ligature() {
let c = Canonicalizer::default();
assert_eq!(c.canonicalize("file"), "file");
}
#[test]
fn idempotence() {
let c = Canonicalizer::default();
let a = c.canonicalize("Façade — Test\u{202E}rev\u{200B}");
let b = c.canonicalize(&a);
assert_eq!(a, b);
}
#[test]
fn config_string_is_stable() {
let c = Canonicalizer::default();
assert_eq!(c.config_string(), "nfkc-cf-simple-bidi-fmt");
}
#[test]
fn convenience_function_matches_default() {
let direct = canonicalize("Mixed CASE");
let viaobj = Canonicalizer::default().canonicalize("Mixed CASE");
assert_eq!(direct, viaobj);
}
#[test]
fn none_normalization_passes_through() {
let c = CanonicalizerBuilder {
normalization: Normalization::None,
case_fold: CaseFold::None,
strip_bidi: false,
strip_format: false,
apply_confusable: false,
}
.build();
assert_eq!(c.canonicalize("HéLLo"), "HéLLo");
}
#[test]
fn bidi_strip_kills_rlo() {
let c = Canonicalizer::default();
let s = c.canonicalize("admin\u{202E}gnirts");
assert!(!s.contains('\u{202E}'));
}
#[test]
fn casefold_does_not_use_turkish_locale() {
let c = Canonicalizer::default();
let folded = c.canonicalize("İ");
assert!(folded.contains('i'));
assert!(!folded.contains('ı'), "got: {folded:?}");
}
#[test]
fn config_string_reflects_overrides() {
let c = CanonicalizerBuilder {
normalization: Normalization::Nfc,
case_fold: CaseFold::None,
strip_bidi: false,
strip_format: true,
apply_confusable: false,
}
.build();
assert_eq!(c.config_string(), "nfc-cf-none-fmt");
}
#[test]
fn canonicalizer_is_send_sync() {
fn assert_traits<T: Send + Sync>() {}
assert_traits::<Canonicalizer>();
}
#[test]
fn empty_input_yields_empty_output() {
assert_eq!(canonicalize(""), "");
}
#[test]
fn variation_selector_is_stripped() {
let c = Canonicalizer::default();
assert_eq!(c.canonicalize("a\u{FE0F}"), "a");
}
#[test]
fn idempotence_on_arabic() {
let c = Canonicalizer::default();
let a = c.canonicalize("الْعَرَبِيَّة");
assert_eq!(c.canonicalize(&a), a);
}
#[test]
fn idempotence_with_expanding_casefold_before_combining_mark() {
let c = Canonicalizer::default();
let input = "İ\u{329}";
let a = c.canonicalize(input);
assert_eq!(c.canonicalize(&a), a);
}
#[test]
fn idempotence_under_normalization_none_with_casefold() {
let c = CanonicalizerBuilder {
normalization: Normalization::None,
..Default::default()
}
.build();
for input in &[
"İ\u{329}",
"\u{6e4}\u{202a}\u{6e4}\u{6ea}-\u{2}\u{3}",
"\n(\u{b}462ljİ\u{329}",
] {
let a = c.canonicalize(input);
assert_eq!(c.canonicalize(&a), a, "input = {input:?}");
}
}
#[cfg(feature = "security")]
#[test]
fn idempotence_with_confusable_skeleton() {
let c = CanonicalizerBuilder {
apply_confusable: true,
..Default::default()
}
.build();
for input in &[
"café", "naïveté", "раураl", "İ\u{329}", "\u{6e4}\u{202a}\u{6e4}\u{6ea}-\u{2}\u{3}",
] {
let a = c.canonicalize(input);
assert_eq!(c.canonicalize(&a), a, "input = {input:?}");
}
}
#[test]
fn idempotence_with_format_char_between_combining_marks() {
let c = Canonicalizer::default();
let input = "\u{6e4}\u{202a}\u{6e4}\u{6ea}-\u{2}\u{3}";
let a = c.canonicalize(input);
assert_eq!(c.canonicalize(&a), a);
}
#[test]
fn config_round_trip_via_to_string() {
let s = Canonicalizer::default().config_string();
let _: String = s.to_string();
}
}