use anyhow::{Context, anyhow};
use camino::Utf8PathBuf;
use indexmap::IndexMap;
use once_cell::sync::OnceCell;
use similar_asserts::assert_eq;
use utf8proc::transform::UnicodeNormalizationForm;
/// One data line of `NormalizationTest.txt`, decoded from hexadecimal code points into strings.
struct NormalizationTestEntry {
/// Position of this entry's line within the test data file, as recorded by the loader.
/// Only used to make failure messages traceable back to the data file.
pub line_number: u64,
/// The input string to normalize (first field of the data line).
source: String,
/// Expected normalization result of `source`, keyed by each form listed in `FORMS`.
forms: IndexMap<UnicodeNormalizationForm, String>,
}
/// Normalization forms covered by the test data.
///
/// Order matters: the loader pairs the form at index `i` with the `(i + 1)`-th
/// semicolon-separated field of each data line (field 0 being the source string).
const FORMS: &[UnicodeNormalizationForm] = &[
UnicodeNormalizationForm::NFC,
UnicodeNormalizationForm::NFD,
UnicodeNormalizationForm::NFKC,
UnicodeNormalizationForm::NFKD,
];
fn load_normalization_tests() -> anyhow::Result<&'static [NormalizationTestEntry]> {
static NORMALIZATION_TESTS: OnceCell<Vec<NormalizationTestEntry>> = OnceCell::new();
NORMALIZATION_TESTS
.get_or_try_init(|| {
let root_dir = Utf8PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
let unicode_dir = root_dir.join("data/unicode/current");
let normalization_test_data = unicode_dir.join("NormalizationTest.txt");
let text = std::fs::read_to_string(&normalization_test_data)
.with_context(|| format!("Failed to read from {normalization_test_data} (missing?)"))?;
let mut res = Vec::new();
for (line_number, line) in text.lines().enumerate() {
let line_number = line_number as u64;
if line.starts_with(['#', '@']) {
continue;
}
let parts = line.split(";").collect::<Vec<&str>>();
assert!(parts.len() >= 5, "Not enough parts on {line_number}");
fn parse_part(part: &str) -> anyhow::Result<String> {
let mut res = String::new();
for hex_entry in part.split_whitespace() {
let c = u32::from_str_radix(hex_entry, 16)
.ok()
.and_then(char::from_u32)
.ok_or_else(|| anyhow!("Not a valid character: {hex_entry:?}"))?;
res.push(c);
}
Ok(res)
}
res.push(NormalizationTestEntry {
line_number,
source: parse_part(parts[0])
.with_context(|| format!("Invalid source entry on line {line_number}"))?,
forms: FORMS
.iter()
.enumerate()
.map(|(offset, &form)| {
let part = parts[1 + offset];
Ok((
form,
parse_part(part)
.with_context(|| format!("Invalid {form} entry on line {line_number}"))?,
))
})
.collect::<Result<IndexMap<_, _>, anyhow::Error>>()?,
})
}
Ok(res)
})
.map(Vec::as_slice)
}
/// Smoke test: plain ASCII text is already normalized, so NFD must return it unchanged.
#[test]
fn normalization_ascii() {
    let normalized =
        utf8proc::transform::normalize("abc", UnicodeNormalizationForm::NFD).unwrap();
    assert_eq!(normalized, "abc");
}
/// Expands to one `#[test]` function per listed normalization form, named
/// `<form in lowercase>_normalization`.
///
/// Each generated test normalizes the source of every loaded conformance entry
/// with that form and compares the result to the entry's expected string.
macro_rules! form_tests {
    ($($form:ident),*) => (paste::paste! {
        $(
            #[test]
            fn [<$form:lower _normalization>]() {
                const FORM: UnicodeNormalizationForm = UnicodeNormalizationForm::$form;
                let entries = load_normalization_tests().unwrap();
                for NormalizationTestEntry { line_number, source: src, forms } in entries {
                    // Built up front: serves as both the error context and the assertion message.
                    let context = format!("{FORM} normalizing {src:?} (line {})", line_number);
                    let normalized = utf8proc::transform::normalize(src, FORM);
                    let result = normalized.with_context(|| context.clone()).unwrap();
                    assert_eq!(result, forms[&FORM], "{context}");
                }
            }
        )*
    })
}
form_tests!(NFC, NFD, NFKC, NFKD);