Skip to main content

simd_normalizer/
lib.rs

1//! simd-normalizer -- SIMD-accelerated Unicode normalization.
2//!
3//! Provides NFC, NFD, NFKC, NFKD normalization with a single-pass
4//! SIMD-guided architecture.  The core is `no_std + alloc`; enable
5//! the `std` feature for runtime CPU dispatch.
6
7#![no_std]
8#![warn(missing_docs)]
9
10extern crate alloc;
11
12#[cfg(feature = "std")]
13extern crate std;
14
15use alloc::borrow::Cow;
16
17pub mod casefold;
18mod ccc;
19mod compose;
20pub mod confusable;
21mod decompose;
22mod hangul;
23pub mod matching;
24pub mod normalizer;
25mod quick_check;
26#[cfg(not(any(test, feature = "internal-test-api")))]
27pub(crate) mod simd;
28
29#[cfg(any(test, feature = "internal-test-api"))]
30#[doc(hidden)]
31pub mod simd;
32mod tables;
33mod utf8;
34
35#[cfg(any(test, feature = "internal-test-api"))]
36pub mod tables_ext;
37
/// Thin public wrappers around the crate's SIMD chunk scanners, exposed so
/// that integration tests can call them.
///
/// Only compiled under `cfg(test)` or with the `internal-test-api` feature.
/// Not for downstream use; semver-exempt. Every wrapper tracks the matching
/// `simd::scan_chunk*` signature exactly.
#[cfg(any(test, feature = "internal-test-api"))]
// NOTE(review): presumably required because the doc links below point at
// items that are not part of the public documentation -- confirm.
#[allow(rustdoc::private_intra_doc_links)]
pub mod simd_test_api {
    use crate::simd;

    /// Forwards to [`crate::simd::scan_chunk`] with the arguments unchanged.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for 64 bytes of read access.
    #[inline]
    pub unsafe fn scan_chunk(ptr: *const u8, bound: u8) -> u64 {
        // SAFETY: delegated to the caller; see `# Safety` above.
        unsafe { simd::scan_chunk(ptr, bound) }
    }

    /// Direct NEON `scan_chunk`, for cross-vtable consistency tests on
    /// aarch64. Mirrors the dispatched signature.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for 64 bytes of read access.
    #[cfg(target_arch = "aarch64")]
    #[inline]
    pub unsafe fn neon_scan_chunk(ptr: *const u8, bound: u8) -> u64 {
        // SAFETY: delegated to the caller; see `# Safety` above.
        unsafe { simd::aarch64::neon::scan_chunk(ptr, bound) }
    }

    /// Direct SVE2 `scan_chunk`, for cross-vtable consistency tests on
    /// aarch64. Mirrors the dispatched signature.
    ///
    /// On a std target the caller MUST check
    /// `is_aarch64_feature_detected!("sve2")` before calling this function:
    /// invoking it on a host without SVE2 is undefined behaviour (SIGILL).
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for 64 bytes of read access AND the host must
    /// support SVE2.
    #[cfg(target_arch = "aarch64")]
    #[inline]
    pub unsafe fn sve2_scan_chunk(ptr: *const u8, bound: u8) -> u64 {
        // SAFETY: delegated to the caller, who guarantees both the readable
        // 64-byte region and SVE2 availability; see `# Safety` above.
        unsafe { simd::aarch64::sve2::scan_chunk(ptr, bound) }
    }
}
76
77pub use casefold::{CaseFoldMode, casefold, casefold_char};
78pub use confusable::{are_confusable, skeleton};
79#[cfg(any(test, feature = "internal-test-api"))]
80pub use matching::normalize_for_matching_legacy;
81pub use matching::{
82    MatchingOptions, matches_normalized, normalize_for_matching, normalize_for_matching_utf16,
83};
84pub use normalizer::{NfcNormalizer, NfdNormalizer, NfkcNormalizer, NfkdNormalizer};
85pub use quick_check::IsNormalized;
86
87#[cfg(feature = "quick_check_oracle")]
88pub use crate::quick_check::{
89    quick_check_nfc, quick_check_nfc_oracle, quick_check_nfd, quick_check_nfd_oracle,
90    quick_check_nfkc, quick_check_nfkc_oracle, quick_check_nfkd, quick_check_nfkd_oracle,
91};
92
93/// Return a pre-built NFC normalizer.
94#[inline]
95pub fn nfc() -> NfcNormalizer {
96    NfcNormalizer::new()
97}
98
99/// Return a pre-built NFD normalizer.
100#[inline]
101pub fn nfd() -> NfdNormalizer {
102    NfdNormalizer::new()
103}
104
105/// Return a pre-built NFKC normalizer.
106#[inline]
107pub fn nfkc() -> NfkcNormalizer {
108    NfkcNormalizer::new()
109}
110
111/// Return a pre-built NFKD normalizer.
112#[inline]
113pub fn nfkd() -> NfkdNormalizer {
114    NfkdNormalizer::new()
115}
116
/// Convenience trait for normalizing `&str` slices.
///
/// The conversion methods (`nfc`, `nfd`, `nfkc`, `nfkd`) return
/// `Cow<'_, str>`, which is `Cow::Borrowed` when the input is already in the
/// target normalization form (zero allocation).
///
/// The predicate methods (`is_nfc`, `is_nfd`, `is_nfkc`, `is_nfkd`) return
/// `bool`.
pub trait UnicodeNormalization {
    /// Normalize to NFC (Canonical Decomposition, followed by Canonical Composition).
    #[must_use]
    fn nfc(&self) -> Cow<'_, str>;
    /// Normalize to NFD (Canonical Decomposition).
    #[must_use]
    fn nfd(&self) -> Cow<'_, str>;
    /// Normalize to NFKC (Compatibility Decomposition, followed by Canonical Composition).
    #[must_use]
    fn nfkc(&self) -> Cow<'_, str>;
    /// Normalize to NFKD (Compatibility Decomposition).
    #[must_use]
    fn nfkd(&self) -> Cow<'_, str>;
    /// Check whether the string is already in NFC.
    #[must_use]
    fn is_nfc(&self) -> bool;
    /// Check whether the string is already in NFD.
    #[must_use]
    fn is_nfd(&self) -> bool;
    /// Check whether the string is already in NFKC.
    #[must_use]
    fn is_nfkc(&self) -> bool;
    /// Check whether the string is already in NFKD.
    #[must_use]
    fn is_nfkd(&self) -> bool;
}
139
140impl UnicodeNormalization for str {
141    #[inline]
142    fn nfc(&self) -> Cow<'_, str> {
143        crate::nfc().normalize(self)
144    }
145    #[inline]
146    fn nfd(&self) -> Cow<'_, str> {
147        crate::nfd().normalize(self)
148    }
149    #[inline]
150    fn nfkc(&self) -> Cow<'_, str> {
151        crate::nfkc().normalize(self)
152    }
153    #[inline]
154    fn nfkd(&self) -> Cow<'_, str> {
155        crate::nfkd().normalize(self)
156    }
157    #[inline]
158    fn is_nfc(&self) -> bool {
159        crate::nfc().is_normalized(self)
160    }
161    #[inline]
162    fn is_nfd(&self) -> bool {
163        crate::nfd().is_normalized(self)
164    }
165    #[inline]
166    fn is_nfkc(&self) -> bool {
167        crate::nfkc().is_normalized(self)
168    }
169    #[inline]
170    fn is_nfkd(&self) -> bool {
171        crate::nfkd().is_normalized(self)
172    }
173}