Skip to main content

simd_normalizer/
lib.rs

1//! simd-normalizer -- SIMD-accelerated Unicode normalization.
2//!
3//! Provides NFC, NFD, NFKC, NFKD normalization with a single-pass
4//! SIMD-guided architecture.  The core is `no_std + alloc`; enable
5//! the `std` feature for runtime CPU dispatch.
6
7#![no_std]
8#![warn(missing_docs)]
9
10extern crate alloc;
11
12#[cfg(feature = "std")]
13extern crate std;
14
15use alloc::borrow::Cow;
16
17pub mod casefold;
18mod ccc;
19mod compose;
20pub mod confusable;
21mod decompose;
22mod hangul;
23pub mod matching;
24pub mod normalizer;
25mod quick_check;
26#[cfg(not(any(test, feature = "internal-test-api")))]
27pub(crate) mod simd;
28
29#[cfg(any(test, feature = "internal-test-api"))]
30#[doc(hidden)]
31pub mod simd;
32mod tables;
33mod utf8;
34
35#[cfg(any(test, feature = "internal-test-api"))]
36pub mod tables_ext;
37
/// Thin public shims over crate-internal SIMD routines, exposed so that
/// integration tests can reach them.
///
/// Compiled only for test builds or when the `internal-test-api` feature is
/// enabled. Not for downstream use; semver-exempt. The shim signatures are
/// kept identical to the `simd::scan_chunk*` functions they forward to.
#[cfg(any(test, feature = "internal-test-api"))]
// NOTE(review): presumably silenced because the docs below link to an
// internal item (`crate::simd::scan_chunk`) -- confirm before removing.
#[allow(rustdoc::private_intra_doc_links)]
pub mod simd_test_api {
    use crate::simd;

    /// Forwards to [`crate::simd::scan_chunk`] and returns its result unchanged.
    ///
    /// # Safety
    ///
    /// `ptr` must be valid for 64 bytes of read access.
    #[inline]
    pub unsafe fn scan_chunk(ptr: *const u8, bound: u8) -> u64 {
        // SAFETY: the caller guarantees `ptr` is readable for 64 bytes (see
        // `# Safety` above); both arguments are passed through untouched, and
        // that guarantee is assumed to be exactly what the inner routine needs.
        unsafe { simd::scan_chunk(ptr, bound) }
    }
}
52
53pub use casefold::{CaseFoldMode, casefold, casefold_char};
54pub use confusable::{are_confusable, skeleton};
55#[cfg(any(test, feature = "internal-test-api"))]
56pub use matching::normalize_for_matching_legacy;
57pub use matching::{
58    MatchingOptions, matches_normalized, normalize_for_matching, normalize_for_matching_utf16,
59};
60pub use normalizer::{NfcNormalizer, NfdNormalizer, NfkcNormalizer, NfkdNormalizer};
61pub use quick_check::IsNormalized;
62
63#[cfg(feature = "quick_check_oracle")]
64pub use crate::quick_check::{
65    quick_check_nfc, quick_check_nfc_oracle, quick_check_nfd, quick_check_nfd_oracle,
66    quick_check_nfkc, quick_check_nfkc_oracle, quick_check_nfkd, quick_check_nfkd_oracle,
67};
68
69/// Return a pre-built NFC normalizer.
70#[inline]
71pub fn nfc() -> NfcNormalizer {
72    NfcNormalizer::new()
73}
74
75/// Return a pre-built NFD normalizer.
76#[inline]
77pub fn nfd() -> NfdNormalizer {
78    NfdNormalizer::new()
79}
80
81/// Return a pre-built NFKC normalizer.
82#[inline]
83pub fn nfkc() -> NfkcNormalizer {
84    NfkcNormalizer::new()
85}
86
87/// Return a pre-built NFKD normalizer.
88#[inline]
89pub fn nfkd() -> NfkdNormalizer {
90    NfkdNormalizer::new()
91}
92
/// Convenience trait for normalizing `&str` slices.
///
/// The four `nf*` methods return `Cow<'_, str>`, which is `Cow::Borrowed` when
/// the input is already in the target normalization form (zero allocation).
/// The four `is_nf*` methods return a `bool` telling whether the input is
/// already in that form.
///
/// No method modifies the receiver, so every method is marked `#[must_use]`:
/// discarding the result means the call accomplished nothing.
pub trait UnicodeNormalization {
    /// Normalize to NFC (Canonical Decomposition, followed by Canonical Composition).
    #[must_use = "normalization returns a new value and leaves the input unchanged"]
    fn nfc(&self) -> Cow<'_, str>;
    /// Normalize to NFD (Canonical Decomposition).
    #[must_use = "normalization returns a new value and leaves the input unchanged"]
    fn nfd(&self) -> Cow<'_, str>;
    /// Normalize to NFKC (Compatibility Decomposition, followed by Canonical Composition).
    #[must_use = "normalization returns a new value and leaves the input unchanged"]
    fn nfkc(&self) -> Cow<'_, str>;
    /// Normalize to NFKD (Compatibility Decomposition).
    #[must_use = "normalization returns a new value and leaves the input unchanged"]
    fn nfkd(&self) -> Cow<'_, str>;
    /// Check whether the string is already in NFC.
    #[must_use = "this only reports the normalization status; it has no side effects"]
    fn is_nfc(&self) -> bool;
    /// Check whether the string is already in NFD.
    #[must_use = "this only reports the normalization status; it has no side effects"]
    fn is_nfd(&self) -> bool;
    /// Check whether the string is already in NFKC.
    #[must_use = "this only reports the normalization status; it has no side effects"]
    fn is_nfkc(&self) -> bool;
    /// Check whether the string is already in NFKD.
    #[must_use = "this only reports the normalization status; it has no side effects"]
    fn is_nfkd(&self) -> bool;
}
115
116impl UnicodeNormalization for str {
117    #[inline]
118    fn nfc(&self) -> Cow<'_, str> {
119        crate::nfc().normalize(self)
120    }
121    #[inline]
122    fn nfd(&self) -> Cow<'_, str> {
123        crate::nfd().normalize(self)
124    }
125    #[inline]
126    fn nfkc(&self) -> Cow<'_, str> {
127        crate::nfkc().normalize(self)
128    }
129    #[inline]
130    fn nfkd(&self) -> Cow<'_, str> {
131        crate::nfkd().normalize(self)
132    }
133    #[inline]
134    fn is_nfc(&self) -> bool {
135        crate::nfc().is_normalized(self)
136    }
137    #[inline]
138    fn is_nfd(&self) -> bool {
139        crate::nfd().is_normalized(self)
140    }
141    #[inline]
142    fn is_nfkc(&self) -> bool {
143        crate::nfkc().is_normalized(self)
144    }
145    #[inline]
146    fn is_nfkd(&self) -> bool {
147        crate::nfkd().is_normalized(self)
148    }
149}