disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
//! Layer 1 (pure-Rust core): full Unicode case folding. No pyo3.
//!
//! Shim in `src/py/case_fold.rs`; crates.io surface is `crate::api::fold_case`.

use crate::tables::case_folding_data;

// disarm does not cap input size — bounding untrusted input is the caller's
// responsibility (case folding is linear time/memory; see #80).

/// Applies full Unicode case folding (CaseFolding.txt, statuses C and F) and
/// returns the folded text as a newly allocated `String`.
///
/// This is *full* folding, so results differ from `str.lower()` /
/// `char::to_lowercase()`: ß→ss, İ (U+0130)→i + U+0307, U+FB01 (fi ligature)→fi,
/// µ (U+00B5)→μ (U+03BC), ſ→s, ς→σ, plus ~1,500 other mappings including
/// Cherokee, Adlam, and all ligature expansions.
///
/// The work is delegated to [`fold_case_into`], which takes these routes:
/// 1. Whole-string ASCII bypass — an all-ASCII input is lowercased in bulk,
///    with no PHF lookup.
/// 2. Per-character ASCII check — ASCII characters (including A-Z) are
///    lowercased inline.
/// 3. PHF lookup — O(1) for all 1,557 Unicode case folding entries.
/// 4. Identity fallback — a character without a table entry is emitted as-is.
pub(crate) fn fold_case_impl(text: &str) -> String {
    let mut folded = String::new();
    fold_case_into(text, &mut folded);
    folded
}

/// Copy-on-write variant of [`fold_case_impl`] (#352).
///
/// Scans `text` once; if no character is ASCII uppercase and no non-ASCII
/// character has a folding-table entry, the input is already folded and is
/// handed back as `Cow::Borrowed` without allocating. Otherwise the folded
/// text is produced by [`fold_case_impl`] and returned as `Cow::Owned`.
pub(crate) fn fold_case_cow(text: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    // ASCII characters are decided without touching the table; only
    // non-ASCII characters are probed.
    let needs_folding = |c: char| {
        if c.is_ascii() {
            c.is_ascii_uppercase()
        } else {
            case_folding_data::lookup(c).is_some()
        }
    };

    if !text.chars().any(needs_folding) {
        return Cow::Borrowed(text);
    }
    Cow::Owned(fold_case_impl(text))
}

/// Buffer-reusing variant of [`fold_case_impl`]: `result` is cleared and then
/// filled with the case-folded form of `text`, so a pipeline can keep a single
/// buffer alive across steps (#236 item 7).
pub(crate) fn fold_case_into(text: &str, result: &mut String) {
    result.clear();

    // Whole-string ASCII shortcut: copy the bytes, lowercase them in place,
    // and never consult the folding table.
    if text.is_ascii() {
        result.push_str(text);
        result.make_ascii_lowercase();
        return;
    }

    // Reserve about 10% beyond the input length so expanding mappings
    // (e.g. ß→ss, U+FB03→ffi) are less likely to force a reallocation; the
    // slack is small when nothing expands.
    let headroom = text.len() / 10;
    result.reserve(text.len() + headroom);

    for c in text.chars() {
        if c.is_ascii() {
            // ASCII is lowercased directly, skipping the PHF lookup.
            result.push(c.to_ascii_lowercase());
            continue;
        }
        match case_folding_data::lookup(c) {
            Some(mapping) => result.push_str(mapping),
            // No table entry: the character folds to itself.
            None => result.push(c),
        }
    }
}

/// Unit and property tests for the case-folding entry points.
///
/// Characters that are easily damaged by normalization or re-encoding
/// (ligatures, capital sharp s, the spot-check table) are written as
/// `\u{…}` escapes so the tests keep exercising the intended code points.
#[cfg(test)]
mod tests {
    use super::*;

    // ── Basics and ASCII fast path ──────────────────────────────────

    #[test]
    fn test_fold_case_basic() {
        assert_eq!(fold_case_impl("Hello"), "hello");
        assert_eq!(fold_case_impl("Straße"), "strasse");
    }

    #[test]
    fn test_fold_case_ascii_fast_path() {
        assert_eq!(fold_case_impl("HELLO WORLD"), "hello world");
        assert_eq!(fold_case_impl("already lowercase"), "already lowercase");
        assert_eq!(fold_case_impl("MiXeD CaSe 123!"), "mixed case 123!");
    }

    #[test]
    fn test_fold_case_pure_ascii_digits_and_punctuation() {
        // Digits and punctuation pass through unchanged.
        assert_eq!(fold_case_impl("12345!@#$%"), "12345!@#$%");
        assert_eq!(fold_case_impl("foo_bar-baz.qux"), "foo_bar-baz.qux");
    }

    #[test]
    fn test_fold_case_empty_string() {
        assert_eq!(fold_case_impl(""), "");
    }

    #[test]
    fn test_fold_case_single_ascii_char() {
        assert_eq!(fold_case_impl("A"), "a");
        assert_eq!(fold_case_impl("z"), "z");
        assert_eq!(fold_case_impl("7"), "7");
    }

    // ── Latin ligatures ─────────────────────────────────────────────

    #[test]
    fn test_fold_case_ligatures() {
        // U+FB01 (fi), U+FB02 (fl), U+FB00 (ff), U+FB03 (ffi), U+FB04 (ffl)
        // each expand to their ASCII letters. Escapes are used so the input
        // really contains the ligature code points.
        assert_eq!(
            fold_case_impl("\u{FB01}nd \u{FB02}at \u{FB00} \u{FB03} \u{FB04}"),
            "find flat ff ffi ffl"
        );
        // U+017F (long s) → s; U+FB05 (long s t) and U+FB06 (st) → st.
        assert_eq!(fold_case_impl("\u{017F}top \u{FB06}op"), "stop stop");
        assert_eq!(fold_case_impl("\u{FB05}"), "st");
    }

    // ── Latin Extended: characters where fold != lower ────────────

    #[test]
    fn test_fold_case_micro_sign_to_greek_mu() {
        // µ (U+00B5 micro sign) → μ (U+03BC Greek small mu)
        assert_eq!(fold_case_impl("\u{00B5}"), "\u{03BC}");
    }

    #[test]
    fn test_fold_case_long_s_to_s() {
        // ſ (U+017F long s) → s
        assert_eq!(fold_case_impl("\u{017F}"), "s");
    }

    #[test]
    fn test_fold_case_eszett() {
        // ß (U+00DF) → ss
        assert_eq!(fold_case_impl("\u{00DF}"), "ss");
        // ẞ (U+1E9E capital eszett) → ss
        assert_eq!(fold_case_impl("\u{1E9E}"), "ss");
    }

    #[test]
    fn test_fold_case_dotted_i() {
        // İ (U+0130) → i + combining dot above (U+0307)
        assert_eq!(fold_case_impl("\u{0130}"), "i\u{0307}");
    }

    // ── Greek ────────────────────────────────────────────────────────

    #[test]
    fn test_fold_case_greek_uppercase() {
        assert_eq!(fold_case_impl("ΑΒΓΔ"), "αβγδ");
        assert_eq!(fold_case_impl("ΩΨΧΦ"), "ωψχφ");
    }

    #[test]
    fn test_fold_case_greek_final_sigma() {
        // ς (U+03C2 final sigma) → σ (U+03C3)
        assert_eq!(fold_case_impl("\u{03C2}"), "\u{03C3}");
    }

    #[test]
    fn test_fold_case_greek_variant_forms() {
        // ϐ (U+03D0 beta symbol) → β
        assert_eq!(fold_case_impl("\u{03D0}"), "\u{03B2}");
        // ϑ (U+03D1 theta symbol) → θ
        assert_eq!(fold_case_impl("\u{03D1}"), "\u{03B8}");
        // ϕ (U+03D5 phi symbol) → φ
        assert_eq!(fold_case_impl("\u{03D5}"), "\u{03C6}");
        // ϖ (U+03D6 pi symbol) → π
        assert_eq!(fold_case_impl("\u{03D6}"), "\u{03C0}");
        // ϰ (U+03F0 kappa symbol) → κ
        assert_eq!(fold_case_impl("\u{03F0}"), "\u{03BA}");
        // ϱ (U+03F1 rho symbol) → ρ
        assert_eq!(fold_case_impl("\u{03F1}"), "\u{03C1}");
    }

    #[test]
    fn test_fold_case_greek_with_tonos() {
        // U+0390 folds to its decomposed form:
        // ι (U+03B9) + combining diaeresis (U+0308) + combining acute (U+0301)
        assert_eq!(fold_case_impl("\u{0390}"), "\u{03B9}\u{0308}\u{0301}");
    }

    // ── Cyrillic ─────────────────────────────────────────────────────

    #[test]
    fn test_fold_case_cyrillic_uppercase() {
        assert_eq!(fold_case_impl("АБВГД"), "абвгд");
        assert_eq!(fold_case_impl("ЭЮЯЪ"), "эюяъ");
    }

    #[test]
    fn test_fold_case_cyrillic_mixed() {
        assert_eq!(fold_case_impl("Москва"), "москва");
        assert_eq!(fold_case_impl("КИЇВ"), "київ");
    }

    // ── Armenian ─────────────────────────────────────────────────────

    #[test]
    fn test_fold_case_armenian() {
        // Ա (U+0531) → ա (U+0561)
        assert_eq!(fold_case_impl("\u{0531}"), "\u{0561}");
        // Armenian ligature և (U+0587) → եւ
        assert_eq!(fold_case_impl("\u{0587}"), "\u{0565}\u{0582}");
    }

    // ── Georgian ─────────────────────────────────────────────────────

    #[test]
    fn test_fold_case_georgian_mtavruli() {
        // Mtavruli Ა (U+1C90) → ა (U+10D0)
        assert_eq!(fold_case_impl("\u{1C90}"), "\u{10D0}");
    }

    // ── Cherokee ─────────────────────────────────────────────────────

    #[test]
    fn test_fold_case_cherokee() {
        // Cherokee is unusual: CaseFolding.txt maps the *small* forms
        // (U+AB70–U+ABBF) to the original uppercase forms (U+13A0–U+13EF).
        // The uppercase forms themselves have no folding entry → identity.
        // Ꭰ (U+13A0) stays Ꭰ.
        assert_eq!(fold_case_impl("\u{13A0}"), "\u{13A0}");
        // Small ꭰ (U+AB70) folds to Ꭰ (U+13A0).
        assert_eq!(fold_case_impl("\u{AB70}"), "\u{13A0}");
        assert_eq!(fold_case_impl("\u{AB71}"), "\u{13A1}");
    }

    // ── Adlam ────────────────────────────────────────────────────────

    #[test]
    fn test_fold_case_adlam() {
        // Adlam capital 𞤀 (U+1E900) → small 𞤢 (U+1E922)
        assert_eq!(fold_case_impl("\u{1E900}"), "\u{1E922}");
        // Adlam capital 𞤁 (U+1E901) → small 𞤣 (U+1E923)
        assert_eq!(fold_case_impl("\u{1E901}"), "\u{1E923}");
    }

    // ── Fullwidth Latin ──────────────────────────────────────────────

    #[test]
    fn test_fold_case_fullwidth_latin() {
        // FULLWIDTH LATIN CAPITAL LETTER A (U+FF21) → small a (U+FF41)
        assert_eq!(fold_case_impl("\u{FF21}"), "\u{FF41}");
        // FULLWIDTH LATIN CAPITAL LETTER Z (U+FF3A) → small z (U+FF5A)
        assert_eq!(fold_case_impl("\u{FF3A}"), "\u{FF5A}");
    }

    // ── Mixed-script strings ─────────────────────────────────────────

    #[test]
    fn test_fold_case_mixed_scripts() {
        assert_eq!(fold_case_impl("Café ΣΟΦΙΑ"), "café σοφια");
    }

    #[test]
    fn test_fold_case_mixed_ascii_and_non_ascii() {
        // ASCII uppercase + non-ASCII uppercase in one string.
        assert_eq!(fold_case_impl("ABC Straße ÄÖÜ"), "abc strasse äöü");
    }

    #[test]
    fn test_fold_case_mixed_cjk_and_latin() {
        // CJK passes through; Latin folds.
        assert_eq!(fold_case_impl("Hello 你好 WORLD"), "hello 你好 world");
    }

    // ── Identity / passthrough ───────────────────────────────────────

    #[test]
    fn test_fold_case_identity_cjk() {
        assert_eq!(fold_case_impl("你好世界"), "你好世界");
    }

    #[test]
    fn test_fold_case_identity_emoji() {
        assert_eq!(fold_case_impl("🎉🚀💡"), "🎉🚀💡");
    }

    #[test]
    fn test_fold_case_identity_already_folded() {
        // Already-folded non-ASCII should pass through unchanged.
        assert_eq!(fold_case_impl("café résumé naïve"), "café résumé naïve");
    }

    // ── Edge cases ───────────────────────────────────────────────────

    #[test]
    fn test_fold_case_string_length_grows() {
        // ß→ss doubles the char; verify the output length is correct.
        assert_eq!(fold_case_impl("ßßß"), "ssssss");
        assert_eq!(fold_case_impl("ßßß").len(), 6);
    }

    #[test]
    fn test_fold_case_combining_characters_preserved() {
        // Combining marks that are not in CaseFolding.txt pass through.
        // é as e + combining acute accent
        let input = "e\u{0301}";
        assert_eq!(fold_case_impl(input), input);
    }

    #[test]
    fn test_fold_case_null_byte() {
        // Null byte is valid in the middle of a Rust &str.
        assert_eq!(fold_case_impl("A\0B"), "a\0b");
    }

    #[test]
    fn test_fold_case_surrogate_boundary() {
        // Characters near the BMP boundary.
        // U+FFFF is not a case-folding entry → identity.
        assert_eq!(fold_case_impl("\u{FFFF}"), "\u{FFFF}");
        // U+10000 (𐀀 Linear B Syllable B008 A) → identity.
        assert_eq!(fold_case_impl("\u{10000}"), "\u{10000}");
    }

    #[test]
    fn test_fold_case_deseret() {
        // Deseret capital 𐐀 (U+10400) → small 𐐨 (U+10428)
        assert_eq!(fold_case_impl("\u{10400}"), "\u{10428}");
    }

    #[test]
    fn test_fold_case_osage() {
        // Osage capital 𐒰 (U+104B0) → small 𐓘 (U+104D8)
        assert_eq!(fold_case_impl("\u{104B0}"), "\u{104D8}");
    }

    #[test]
    fn test_fold_case_warang_citi() {
        // Warang Citi capital 𑢠 (U+118A0) → small 𑣀 (U+118C0)
        assert_eq!(fold_case_impl("\u{118A0}"), "\u{118C0}");
    }

    #[test]
    fn test_fold_case_agrees_with_casefolding_txt() {
        // Spot-check a handful of entries across the full range
        // to verify the PHF data matches CaseFolding.txt expectations.
        // Non-ASCII entries use escapes so look-alike code points
        // (e.g. U+03A9 vs. U+2126) cannot be confused.
        let cases: &[(char, &str)] = &[
            ('A', "a"),
            ('Z', "z"),
            ('\u{00C0}', "\u{00E0}"),         // À → à
            ('\u{00D0}', "\u{00F0}"),         // Ð → ð
            ('\u{00D8}', "\u{00F8}"),         // Ø → ø
            ('\u{01A9}', "\u{0283}"),         // Ʃ → ʃ
            ('\u{03A9}', "\u{03C9}"),         // Ω → ω
            ('\u{0416}', "\u{0436}"),         // Ж → ж
            ('\u{0587}', "\u{0565}\u{0582}"), // Armenian և → եւ
        ];
        for &(input, expected) in cases {
            let got = fold_case_impl(&input.to_string());
            assert_eq!(
                got, expected,
                "fold_case(U+{:04X} {:?}) = {:?}, expected {:?}",
                input as u32, input, got, expected
            );
        }
    }

    // ── Borrowing and buffer-reusing entry points ────────────────────

    #[test]
    fn test_fold_case_cow_borrows_when_already_folded() {
        use std::borrow::Cow;
        // No ASCII uppercase and no character with a folding entry:
        // the input must be returned without allocating.
        assert!(matches!(fold_case_cow(""), Cow::Borrowed(_)));
        assert!(matches!(fold_case_cow("already folded 123"), Cow::Borrowed(_)));
        assert!(matches!(fold_case_cow("café 你好"), Cow::Borrowed(_)));
    }

    #[test]
    fn test_fold_case_cow_owns_when_folding_changes_text() {
        use std::borrow::Cow;
        // ASCII uppercase triggers folding.
        assert!(matches!(fold_case_cow("Hello"), Cow::Owned(_)));
        assert_eq!(fold_case_cow("Hello"), "hello");
        // A non-ASCII character with a table entry triggers folding.
        assert!(matches!(fold_case_cow("stra\u{00DF}e"), Cow::Owned(_)));
        assert_eq!(fold_case_cow("stra\u{00DF}e"), "strasse");
    }

    #[test]
    fn test_fold_case_into_clears_previous_contents() {
        let mut buf = String::from("stale contents");
        // ASCII fast path.
        fold_case_into("ABC", &mut buf);
        assert_eq!(buf, "abc");
        // Non-ASCII path, reusing the same buffer.
        fold_case_into("Stra\u{00DF}e", &mut buf);
        assert_eq!(buf, "strasse");
        // Empty input leaves an empty buffer.
        fold_case_into("", &mut buf);
        assert_eq!(buf, "");
    }

    // ── Property-based tests ─────────────────────────────────────────

    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            /// Case folding is idempotent: fold(fold(x)) == fold(x).
            #[test]
            fn fold_case_idempotent(s in "\\PC*") {
                let once = fold_case_impl(&s);
                let twice = fold_case_impl(&once);
                prop_assert_eq!(&once, &twice);
            }

            /// After folding, no ASCII uppercase letters remain.
            #[test]
            fn fold_case_no_ascii_uppercase(s in "\\PC*") {
                let result = fold_case_impl(&s);
                for ch in result.chars() {
                    if ch.is_ascii() {
                        prop_assert!(
                            !ch.is_ascii_uppercase(),
                            "uppercase {ch:?} in fold output: {result:?}"
                        );
                    }
                }
            }

            /// Output char count ≥ input char count (folding never drops characters,
            /// though byte length may shrink, e.g. the 3-byte ligature U+FB05 → "st").
            #[test]
            fn fold_case_never_drops_chars(s in "\\PC*") {
                let result = fold_case_impl(&s);
                prop_assert!(
                    result.chars().count() >= s.chars().count(),
                    "fold_case dropped chars: {} → {}",
                    s.chars().count(),
                    result.chars().count()
                );
            }

            /// Pure ASCII input stays pure ASCII after folding.
            #[test]
            fn fold_case_ascii_stays_ascii(s in "[\\x00-\\x7f]*") {
                let result = fold_case_impl(&s);
                prop_assert!(
                    result.is_ascii(),
                    "non-ASCII in fold of ASCII input: {result:?}"
                );
            }

            /// The borrowing form always yields the same text as the owned form.
            #[test]
            fn fold_case_cow_matches_impl(s in "\\PC*") {
                let owned = fold_case_impl(&s);
                let cow = fold_case_cow(&s).into_owned();
                prop_assert_eq!(&cow, &owned);
            }
        }
    }
}