// utf8proc 0.1.2
//
// Rust bindings to the utf8proc library
// Documentation
//! Normalization tests.
//!
//! Relies on test files from the `unicode-normalization` crate.

use anyhow::{Context, anyhow};
use camino::Utf8PathBuf;
use indexmap::IndexMap;
use once_cell::sync::OnceCell;
use similar_asserts::assert_eq;
use utf8proc::transform::UnicodeNormalizationForm;

/// A single test case parsed from one line of the normalization test data.
struct NormalizationTestEntry {
    /// Line of the data file this entry was parsed from; included in failure messages.
    pub line_number: u64,
    /// The source string, decoded from the first field of the line.
    source: String,
    /// Expected result of normalizing `source`, keyed by normalization form.
    forms: IndexMap<UnicodeNormalizationForm, String>,
}

/// Normalization forms in the order their expected results are serialized on each data line.
///
/// The first field of a line is the source string; the fields that follow it hold the
/// expected results for these forms, in exactly this order.
const FORMS: &[UnicodeNormalizationForm] = &[
    UnicodeNormalizationForm::NFC,
    UnicodeNormalizationForm::NFD,
    UnicodeNormalizationForm::NFKC,
    UnicodeNormalizationForm::NFKD,
];

fn load_normalization_tests() -> anyhow::Result<&'static [NormalizationTestEntry]> {
    /// Cache the result to avoid redundant IO.
    static NORMALIZATION_TESTS: OnceCell<Vec<NormalizationTestEntry>> = OnceCell::new();
    NORMALIZATION_TESTS
        .get_or_try_init(|| {
            let root_dir = Utf8PathBuf::from(std::env::var("CARGO_MANIFEST_DIR")?);
            let unicode_dir = root_dir.join("data/unicode/current");
            let normalization_test_data = unicode_dir.join("NormalizationTest.txt");
            let text = std::fs::read_to_string(&normalization_test_data)
                .with_context(|| format!("Failed to read from {normalization_test_data} (missing?)"))?;
            let mut res = Vec::new();

            for (line_number, line) in text.lines().enumerate() {
                let line_number = line_number as u64;
                if line.starts_with(['#', '@']) {
                    continue;
                }
                let parts = line.split(";").collect::<Vec<&str>>();
                assert!(parts.len() >= 5, "Not enough parts on {line_number}");
                fn parse_part(part: &str) -> anyhow::Result<String> {
                    let mut res = String::new();
                    for hex_entry in part.split_whitespace() {
                        let c = u32::from_str_radix(hex_entry, 16)
                            .ok()
                            .and_then(char::from_u32)
                            .ok_or_else(|| anyhow!("Not a valid character: {hex_entry:?}"))?;
                        res.push(c);
                    }
                    Ok(res)
                }
                res.push(NormalizationTestEntry {
                    line_number,
                    source: parse_part(parts[0])
                        .with_context(|| format!("Invalid source entry on line {line_number}"))?,
                    forms: FORMS
                        .iter()
                        .enumerate()
                        .map(|(offset, &form)| {
                            let part = parts[1 + offset];
                            Ok((
                                form,
                                parse_part(part)
                                    .with_context(|| format!("Invalid {form} entry on line {line_number}"))?,
                            ))
                        })
                        .collect::<Result<IndexMap<_, _>, anyhow::Error>>()?,
                })
            }
            Ok(res)
        })
        .map(Vec::as_slice)
}

/// Sanity check: NFD-normalizing the ASCII string `"abc"` yields `"abc"`.
#[test]
fn normalization_ascii() {
    let normalized =
        utf8proc::transform::normalize("abc", UnicodeNormalizationForm::NFD).unwrap();
    assert_eq!(normalized, "abc");
}

/// Generates one `#[test]` function per listed normalization form.
///
/// For a form `X`, the generated test is named `<x lowercased>_normalization`. It normalizes
/// the source string of every entry returned by `load_normalization_tests` and compares the
/// result with the expected string recorded for that form.
macro_rules! form_tests {
    ($($form:ident),*) => (paste::paste! {
        $(
            #[test]
            fn [<$form:lower _normalization>]() {
                const FORM: UnicodeNormalizationForm = UnicodeNormalizationForm::$form;
                let entries = load_normalization_tests().unwrap();
                entries.iter().for_each(|test| {
                    let src = test.source.as_str();
                    // Built once and reused for both the error context and the assertion message.
                    let context = format!("{FORM} normalizing {src:?} (line {})", test.line_number);
                    let result = utf8proc::transform::normalize(src, FORM)
                        .with_context(|| context.clone())
                        .unwrap();
                    assert_eq!(
                        result,
                        test.forms[&FORM],
                        "{context}",
                    );
                });
            }
        )*
    })
}

form_tests!(NFC, NFD, NFKC, NFKD);

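/*
Legacy hand-written versions of the per-form tests, disabled and kept for reference only.
The `form_tests!` invocation above already generates tests with these names.
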
#[test]
pub fn nfd_normalization() {

}

#[test]
pub fn nfc_normalization() {
    for test in data::NORMALIZATION_TESTS {
        assert_eq!(
            utf8proc::transform::normalize(test.source, UnicodeNormalizationForm::NFC,).unwrap(),
            test.nfc,
            "NFC normalization",
        );
    }
}

#[test]
pub fn nfkc_normalization() {
    for test in data::NORMALIZATION_TESTS {
        assert_eq!(
            utf8proc::transform::normalize(test.source, UnicodeNormalizationForm::NFKC,).unwrap(),
            test.nfkc,
            "NFKC normalization",
        );
    }
}
*/