disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
//! Context-aware transliteration for abjad scripts (Arabic, Persian, Hebrew).
//!
//! Uses dictionary-based vowel restoration with bigram context disambiguation.
//! The dictionary maps consonant skeletons (unpointed text) to diacritized
//! forms, which are then transliterated by the existing character-by-character
//! engine.
//!
//! Three-tier fallback:
//! 1. Bigram lookup: (previous_word_skeleton, current_word_skeleton) → best form
//! 2. Unigram lookup: current_word_skeleton → most frequent form
//! 3. Context-free: existing character-by-character transliteration

use std::borrow::Cow;
use std::sync::OnceLock;

/// Tatweel (kashida) — decorative elongation in Arabic.
const TATWEEL: char = '\u{0640}';

/// Binary dictionary format magic bytes.
const MAGIC: &[u8; 4] = b"TRLD";

// `MAX_DICT_ENTRIES` (the `from_bytes` capacity-hint clamp, #116) is centralized
// in `crate::limits` (#256).
use crate::limits::MAX_DICT_ENTRIES;

/// A `(offset, len)` slice into the backing dictionary bytes. `u32` is ample:
/// dictionaries are a few MB and the on-disk format already uses `u32` offsets.
///
/// A span holds no reference to the buffer it describes; it is only meaningful
/// together with the byte buffer it was produced from.
#[derive(Clone, Copy)]
struct Span {
    /// Byte offset of the string's first byte within the dictionary buffer.
    off: u32,
    /// Length of the string in bytes (not characters).
    len: u32,
}

/// One unigram: its consonant skeleton and its single best (most frequent)
/// diacritized form. Entries are sorted by skeleton bytes for binary search.
#[derive(Clone, Copy)]
struct UniEntry {
    /// Lookup key: the unpointed skeleton, as a span into the dictionary bytes.
    skel: Span,
    /// Value: the first form listed for this skeleton in the dictionary file.
    form: Span,
}

/// One bigram entry within a previous-word group: the current-word skeleton and
/// the diacritized form to use. Sorted by `curr` bytes within the group.
#[derive(Clone, Copy)]
struct BiEntry {
    /// Lookup key inside the group: skeleton of the current word.
    curr: Span,
    /// Diacritized form to emit when both the group's `prev` and this `curr` match.
    form: Span,
}

/// A previous-word group: every bigram entry sharing this `prev` skeleton
/// occupies the contiguous `[start, start + len)` range of `bi_entries`. Groups
/// are sorted by `prev` bytes. This nesting preserves the cheap two-step
/// prev → curr lookup — `resolve` probes with borrowed `&str` keys and never
/// allocates an owned `(prev, curr)` tuple per token (#238 guardrail).
#[derive(Clone, Copy)]
struct BiGroup {
    /// Skeleton of the previous word shared by every entry in this group.
    prev: Span,
    /// Index into `bi_entries` of the group's first entry.
    start: u32,
    /// Number of consecutive `bi_entries` elements belonging to this group.
    len: u32,
}

/// Context dictionary backed directly by the raw `.bin` bytes (#238).
///
/// The dictionary strings live exactly **once** — in `data` (borrowed `'static`
/// from the embedded `.rodata` table, or owned from a filesystem read). The
/// lookup indices below hold only `(offset, len)` spans into `data`, so no
/// skeleton or form is duplicated on the heap. Resident cost is the file size
/// plus the fixed-size index vectors, replacing the former nested
/// `HashMap<String, …>` that copied every string a second time (the 2×→1×
/// residency win). Lookup is binary search (`O(log n)`) instead of hashing
/// (`O(1)`) — an accepted trade for this opt-in context path.
///
/// Every span stored in the index vectors was bounds-checked and UTF-8
/// validated against `data` while the index was built, and none of the fields
/// is mutated afterwards.
pub struct ContextDict {
    /// The whole dictionary file: borrowed `'static` (embedded) or owned (fs).
    data: Cow<'static, [u8]>,
    /// Unigram entries, sorted by skeleton bytes.
    unigrams: Vec<UniEntry>,
    /// Bigram previous-word groups, sorted by `prev` bytes.
    bi_groups: Vec<BiGroup>,
    /// Bigram entries, partitioned by `BiGroup` range and sorted by `curr` within each.
    bi_entries: Vec<BiEntry>,
}

/// Decode the little-endian `u16` stored at `data[pos..pos + 2]`.
///
/// Never panics: an offset whose end would overflow `usize`, or a range that
/// runs past the end of `data`, is reported as an `Err` so that a truncated or
/// malformed dictionary stays a recoverable failure.
fn read_u16(data: &[u8], pos: usize) -> Result<u16, String> {
    let Some(end) = pos.checked_add(2) else {
        return Err("dictionary offset overflow".into());
    };
    match data.get(pos..end) {
        // The range is exactly two bytes wide, so the pattern is exhaustive
        // for every in-bounds read.
        Some(&[lo, hi]) => Ok(u16::from_le_bytes([lo, hi])),
        _ => Err("unexpected end of dictionary data".into()),
    }
}

/// Decode the little-endian `u32` stored at `data[pos..pos + 4]`.
///
/// Bounds-checked in the same way as [`read_u16`]: offset overflow and
/// truncated input both produce an `Err` instead of a panic.
fn read_u32(data: &[u8], pos: usize) -> Result<u32, String> {
    let Some(end) = pos.checked_add(4) else {
        return Err("dictionary offset overflow".into());
    };
    match data.get(pos..end) {
        // The range is exactly four bytes wide, so this arm covers every
        // in-bounds read.
        Some(&[b0, b1, b2, b3]) => Ok(u32::from_le_bytes([b0, b1, b2, b3])),
        _ => Err("unexpected end of dictionary data".into()),
    }
}

/// Check that `data[pos..pos + len]` exists and holds valid UTF-8, and return
/// where it lives as a [`Span`] — nothing is copied or allocated on success.
///
/// Errors (never panics) when the range overflows, runs past the end of
/// `data`, is not valid UTF-8, or when `pos` / `len` do not fit in a `u32`.
/// The checks run in that order.
fn read_str_span(data: &[u8], pos: usize, len: usize) -> Result<Span, String> {
    let Some(end) = pos.checked_add(len) else {
        return Err("dictionary offset overflow".into());
    };
    let Some(region) = data.get(pos..end) else {
        return Err("unexpected end of dictionary data".into());
    };
    if let Err(utf8_error) = std::str::from_utf8(region) {
        return Err(utf8_error.to_string());
    }
    let off = u32::try_from(pos).map_err(|_| String::from("dictionary offset exceeds u32"))?;
    let len = u32::try_from(len).map_err(|_| String::from("string length exceeds u32"))?;
    Ok(Span { off, len })
}

impl ContextDict {
    /// Load a context dictionary from a borrowed buffer, copying it once into an
    /// owned backing store. Used by tests and any caller without `'static` data;
    /// the dictionary strings still live exactly once (inside the copy).
    ///
    /// Every read is bounds-checked: a truncated or malformed buffer yields an
    /// `Err` instead of an out-of-bounds panic.
    // Exercised by the unit tests; the runtime loaders use `from_owned`/`from_static`.
    #[cfg_attr(not(test), allow(dead_code))]
    pub fn from_bytes(data: &[u8]) -> Result<Self, String> {
        Self::build(Cow::Owned(data.to_vec()))
    }

    /// Load a context dictionary directly from `'static` bytes (the embedded
    /// `.rodata` table). True zero-copy — the bytes are borrowed, never copied.
    // Only the `embed-dicts` loader calls this; allowed-unused otherwise.
    #[cfg_attr(not(feature = "embed-dicts"), allow(dead_code))]
    pub fn from_static(data: &'static [u8]) -> Result<Self, String> {
        Self::build(Cow::Borrowed(data))
    }

    /// Load a context dictionary taking ownership of a buffer (e.g. a filesystem
    /// read), reusing it as the backing store without an extra copy.
    pub fn from_owned(data: Vec<u8>) -> Result<Self, String> {
        Self::build(Cow::Owned(data))
    }

    /// Build the zero-copy index over `data` (#238): parse the binary format
    /// produced by `scripts/build_*_dict.py` once, recording `(offset, len)`
    /// spans into `data` rather than copying strings onto the heap. The index
    /// vectors are sorted so `resolve` can binary-search them.
    fn build(data: Cow<'static, [u8]>) -> Result<Self, String> {
        // A bigram record before grouping: parsed flat, then sorted into groups.
        struct RawBi {
            prev: Span,
            curr: Span,
            form: Span,
        }

        let bytes: &[u8] = &data;
        if bytes.len() < 24 {
            return Err("Dictionary too small".into());
        }
        if &bytes[0..4] != MAGIC {
            return Err("Invalid dictionary magic".into());
        }
        let version = read_u32(bytes, 4)?;
        if version != 1 {
            return Err(format!("Unsupported dictionary version: {version}"));
        }
        let unigram_count = read_u32(bytes, 8)? as usize;
        let bigram_count = read_u32(bytes, 12)? as usize;
        let unigram_offset = read_u32(bytes, 16)? as usize;
        let bigram_offset = read_u32(bytes, 20)? as usize;
        // Section offsets must point past the 24-byte header. Reads are already
        // bounds-checked (no panic), but rejecting offsets that start inside the
        // header avoids silently returning Ok(...) for a clearly malformed
        // buffer whose sections would overlap the header fields.
        if unigram_offset < 24 || bigram_offset < 24 {
            return Err("Dictionary section offset overlaps header".into());
        }

        // Borrow the span's bytes for ordering comparisons. Spans are produced by
        // `read_str_span` against this same `bytes`, so the range is always valid.
        let span_bytes = |s: Span| &bytes[s.off as usize..s.off as usize + s.len as usize];

        // --- Unigrams: skeleton -> best (first, most-frequent) form span ---
        // `min(MAX_DICT_ENTRIES)` clamps only the *pre-reservation* (#116/#200) so a
        // bogus huge `unigram_count` header cannot drive a giant up-front alloc; it
        // is NOT an entry cap — a genuine dict with more entries still grows past it
        // as the loop pushes, bounded by the actual buffer bytes each entry consumes.
        let mut unigrams: Vec<UniEntry> = Vec::with_capacity(unigram_count.min(MAX_DICT_ENTRIES));
        let mut pos = unigram_offset;
        for _ in 0..unigram_count {
            let skel_len = read_u16(bytes, pos)? as usize;
            pos += 2;
            let skel = read_str_span(bytes, pos, skel_len)?;
            pos += skel_len;

            let num_forms = read_u16(bytes, pos)? as usize;
            pos += 2;

            let mut best: Option<Span> = None;
            for i in 0..num_forms {
                let form_len = read_u16(bytes, pos)? as usize;
                pos += 2;
                let form = read_str_span(bytes, pos, form_len)?;
                pos += form_len;
                let _freq = read_u32(bytes, pos)?;
                pos += 4;
                // Forms are stored most-frequent-first, and `resolve` only ever
                // wants the best one, so keep just the first and skip the rest.
                if i == 0 {
                    best = Some(form);
                }
            }
            // A unigram with zero forms yields nothing resolvable — omit it.
            if let Some(form) = best {
                unigrams.push(UniEntry { skel, form });
            }
        }

        // --- Bigrams: flat (prev, curr, form), parsed then sorted and grouped ---
        // As above (C6): the clamp bounds the pre-reservation only, not the entry
        // count; growth past it is bounded by the buffer bytes.
        let mut raw: Vec<RawBi> = Vec::with_capacity(bigram_count.min(MAX_DICT_ENTRIES));
        pos = bigram_offset;
        for _ in 0..bigram_count {
            let prev_len = read_u16(bytes, pos)? as usize;
            pos += 2;
            let prev = read_str_span(bytes, pos, prev_len)?;
            pos += prev_len;

            let curr_len = read_u16(bytes, pos)? as usize;
            pos += 2;
            let curr = read_str_span(bytes, pos, curr_len)?;
            pos += curr_len;

            let form_len = read_u16(bytes, pos)? as usize;
            pos += 2;
            let form = read_str_span(bytes, pos, form_len)?;
            pos += form_len;

            raw.push(RawBi { prev, curr, form });
        }

        // Sort unigrams by skeleton bytes (UTF-8 byte order == code-point order,
        // so byte comparison is a valid total order for binary search). Dedup any
        // duplicate skeletons — the builders emit unique keys, so this only makes
        // a malformed dict deterministic rather than changing real behaviour.
        unigrams.sort_by(|a, b| span_bytes(a.skel).cmp(span_bytes(b.skel)));
        unigrams.dedup_by(|a, b| span_bytes(a.skel) == span_bytes(b.skel));

        // Sort bigrams by (prev, curr) bytes, then partition into prev-groups.
        raw.sort_by(|a, b| {
            span_bytes(a.prev)
                .cmp(span_bytes(b.prev))
                .then_with(|| span_bytes(a.curr).cmp(span_bytes(b.curr)))
        });
        raw.dedup_by(|a, b| {
            span_bytes(a.prev) == span_bytes(b.prev) && span_bytes(a.curr) == span_bytes(b.curr)
        });

        let mut bi_entries: Vec<BiEntry> = Vec::with_capacity(raw.len());
        let mut bi_groups: Vec<BiGroup> = Vec::new();
        let mut i = 0usize;
        while i < raw.len() {
            let prev = raw[i].prev;
            let start = bi_entries.len();
            let mut j = i;
            while j < raw.len() && span_bytes(raw[j].prev) == span_bytes(prev) {
                bi_entries.push(BiEntry {
                    curr: raw[j].curr,
                    form: raw[j].form,
                });
                j += 1;
            }
            bi_groups.push(BiGroup {
                prev,
                start: u32::try_from(start).map_err(|_| "bigram index exceeds u32".to_string())?,
                len: u32::try_from(bi_entries.len() - start)
                    .map_err(|_| "bigram group exceeds u32".to_string())?,
            });
            i = j;
        }

        Ok(ContextDict {
            data,
            unigrams,
            bi_groups,
            bi_entries,
        })
    }

    /// Bytes of a span. Spans are bounds-validated at build time against this
    /// same `data`, which is immutable thereafter, so the index never panics.
    #[inline]
    fn span_slice(&self, span: Span) -> &[u8] {
        &self.data[span.off as usize..span.off as usize + span.len as usize]
    }

    /// The span as `&str`. UTF-8 was validated at build time; re-validate cheaply
    /// and fall back to `""` rather than risk a panic if that invariant is ever
    /// broken by a future change.
    #[inline]
    fn span_str(&self, span: Span) -> &str {
        std::str::from_utf8(self.span_slice(span)).unwrap_or("")
    }

    /// Resolve a word using bigram context, then unigram fallback.
    ///
    /// Returns the best diacritized form, or `None` if not in the dictionary.
    /// Comparisons are on raw bytes (UTF-8 byte order matches the sort order), so
    /// the binary searches never re-validate UTF-8; only the matched form is
    /// decoded to `&str` once, on the way out.
    pub fn resolve(&self, prev_skeleton: Option<&str>, curr_skeleton: &str) -> Option<&str> {
        let curr = curr_skeleton.as_bytes();

        // Tier 1: bigram lookup — two-step prev -> curr, both borrowed &str keys,
        // no per-token owned-key allocation (#238 guardrail preserved).
        if let Some(prev) = prev_skeleton {
            let prev_bytes = prev.as_bytes();
            if let Ok(gi) = self
                .bi_groups
                .binary_search_by(|g| self.span_slice(g.prev).cmp(prev_bytes))
            {
                let g = self.bi_groups[gi];
                let entries = &self.bi_entries[g.start as usize..(g.start + g.len) as usize];
                if let Ok(ei) = entries.binary_search_by(|e| self.span_slice(e.curr).cmp(curr)) {
                    return Some(self.span_str(entries[ei].form));
                }
            }
        }

        // Tier 2: unigram lookup (most frequent form).
        if let Ok(ui) = self
            .unigrams
            .binary_search_by(|e| self.span_slice(e.skel).cmp(curr))
        {
            return Some(self.span_str(self.unigrams[ui].form));
        }

        // Tier 3: not in dictionary — caller uses context-free transliteration.
        None
    }

    /// Return dictionary statistics: (unigram count, total bigram entry count).
    // Used by the unit tests to assert dictionary parsing.
    #[cfg_attr(not(test), allow(dead_code))]
    pub fn stats(&self) -> (usize, usize) {
        (self.unigrams.len(), self.bi_entries.len())
    }
}

/// Return `word` with every Arabic diacritic (tashkeel) and every tatweel
/// removed; all other characters are kept in their original order.
pub fn strip_arabic_diacritics(word: &str) -> String {
    let mut bare = String::with_capacity(word.len());
    for ch in word.chars() {
        if ch == TATWEEL || is_arabic_diacritic(ch) {
            continue;
        }
        bare.push(ch);
    }
    bare
}

/// Return `word` with every Hebrew niqqud (vowel point) character removed;
/// all other characters are kept in their original order.
pub fn strip_hebrew_niqqud(word: &str) -> String {
    let mut bare = word.to_owned();
    bare.retain(|ch| !is_hebrew_niqqud(ch));
    bare
}

/// Remove the vowel marks that apply to `lang` from `word`.
///
/// `Some("he")` selects Hebrew niqqud stripping. Every other value, including
/// `None`, selects Arabic stripping (Arabic and Persian share the same
/// diacritics).
pub fn strip_diacritics(word: &str, lang: Option<&str>) -> String {
    if lang == Some("he") {
        strip_hebrew_niqqud(word)
    } else {
        strip_arabic_diacritics(word)
    }
}

/// True if `c` lies in one of the code-point ranges treated as Arabic script:
/// U+0600–U+06FF, U+0750–U+077F, U+08A0–U+08FF, U+FB50–U+FDFF or
/// U+FE70–U+FEFF. The test is purely range-based, so it accepts every code
/// point inside those ranges, whatever its kind.
fn is_arabic_char(c: char) -> bool {
    matches!(
        c,
        '\u{0600}'..='\u{06FF}'
            | '\u{0750}'..='\u{077F}'
            | '\u{08A0}'..='\u{08FF}'
            | '\u{FB50}'..='\u{FDFF}'
            | '\u{FE70}'..='\u{FEFF}'
    )
}

/// True if `c` lies in one of the code-point ranges treated as Hebrew script:
/// U+0590–U+05FF or U+FB1D–U+FB4F.
fn is_hebrew_char(c: char) -> bool {
    matches!(c, '\u{0590}'..='\u{05FF}' | '\u{FB1D}'..='\u{FB4F}')
}

/// True if `c` is one of the Arabic diacritics (tashkeel) this module strips:
/// the range U+064B–U+0655, or U+0670 (SUPERSCRIPT ALEF). Constant-time range
/// test rather than a table scan (#108/#200).
#[inline]
fn is_arabic_diacritic(c: char) -> bool {
    c == '\u{0670}' || ('\u{064B}'..='\u{0655}').contains(&c)
}

/// True if `c` is one of the Hebrew niqqud (vowel point) characters this
/// module strips: the range U+05B0–U+05C5, except U+05BE (MAQAF), U+05C0
/// (PASEQ) and U+05C3 (SOF PASUQ). Constant-time test (#108/#200).
#[inline]
fn is_hebrew_niqqud(c: char) -> bool {
    match c {
        // Punctuation that sits inside the point range.
        '\u{05BE}' | '\u{05C0}' | '\u{05C3}' => false,
        '\u{05B0}'..='\u{05C5}' => true,
        _ => false,
    }
}

/// Split `text` into alternating word and non-word tokens.
///
/// A token is a maximal run of consecutive characters of the same class, so
/// concatenating the returned tokens reproduces `text` exactly, and empty
/// input yields no tokens. Each token borrows its slice of the input (#115),
/// so there is no per-token `String` allocation.
///
/// The word class is defined by code-point ranges (Arabic and Hebrew script
/// ranges plus their diacritics and tatweel). Because the test is range-based,
/// code points inside those ranges that are punctuation — for example U+061F
/// ARABIC QUESTION MARK — are classified as word characters as well.
pub fn tokenize(text: &str) -> Vec<Token<'_>> {
    // #108: constant-time range checks. The diacritic/niqqud predicates are
    // the same ones the strip_* functions use, keeping a single definition of
    // those ranges (#200).
    #[inline]
    fn is_word_char(c: char) -> bool {
        c == TATWEEL
            || is_hebrew_niqqud(c)
            || is_arabic_diacritic(c)
            || is_hebrew_char(c)
            || is_arabic_char(c)
    }

    let mut tokens = Vec::new();
    let mut chars = text.char_indices();

    // The first character opens the first run; empty input has no runs.
    let Some((_, first)) = chars.next() else {
        return tokens;
    };
    let mut run_start = 0usize;
    let mut run_is_word = is_word_char(first);

    for (idx, ch) in chars {
        let class = is_word_char(ch);
        if class == run_is_word {
            continue;
        }
        // Class changed: close the run [run_start, idx) and open a new one.
        tokens.push(Token {
            text: Cow::Borrowed(&text[run_start..idx]),
            is_word: run_is_word,
        });
        run_start = idx;
        run_is_word = class;
    }

    // The run still open when the input ends.
    tokens.push(Token {
        text: Cow::Borrowed(&text[run_start..]),
        is_word: run_is_word,
    });

    tokens
}

/// A token from Arabic/Hebrew text tokenization.
///
/// `text` is a `Cow<str>`: [`tokenize`] always returns `Cow::Borrowed` slices of
/// the input (#115) — tokenization never rewrites characters, so it allocates
/// nothing per token — but the type also lets callers construct or transform a
/// `Token` holding `Cow::Owned` text when needed.
///
/// The lifetime `'a` is that of the text a borrowed token points into.
#[derive(Debug, Clone)]
pub struct Token<'a> {
    /// The token text — a word or whitespace/punctuation span. Borrowed from the
    /// input by [`tokenize`]; may be owned if a caller constructs the token.
    pub text: Cow<'a, str>,
    /// True if this token is a word (Arabic/Hebrew script), false for non-word spans.
    pub is_word: bool,
}

/// True if `text` contains a "hard" boundary that should reset bigram context
/// (#101): a newline (`\n` or `\r`), `.`, `!`, `?`, U+061F (Arabic question
/// mark) or U+06D4 (Arabic full stop).
///
/// A plain space is intentionally not a boundary; otherwise the bigram tier
/// could never fire across adjacent, space-separated words.
fn is_context_boundary(text: &str) -> bool {
    text.contains(|c: char| {
        matches!(c, '\n' | '\r' | '.' | '!' | '?' | '\u{061F}' | '\u{06D4}')
    })
}

/// Context-aware transliteration: resolve words via dictionary, then
/// transliterate the diacritized forms using the existing engine.
///
/// For every word, the diacritic-free skeleton is looked up in `dict` (bigram
/// with the previous word's skeleton, then unigram). A hit is transliterated
/// from the dictionary's diacritized form; a miss falls back to transliterating
/// the original text. Non-word tokens are copied to the output unchanged.
///
/// The previous-word context survives plain spaces (#101) and is cleared only
/// at a hard boundary as defined by `is_context_boundary`.
///
/// [`tokenize`] classifies by code-point range, so Arabic punctuation that
/// lives inside the Arabic block arrives glued to the neighbouring word token.
/// Such a token is split here at that punctuation before lookup. Without the
/// split the skeleton would contain the punctuation mark, the dictionary
/// lookup for the word would always miss, and Arabic sentence-final marks
/// would never reset the context. The punctuation itself is still passed
/// through `transliterate_fn`, exactly as it was when it was part of the word.
///
/// * `text` — input to transliterate.
/// * `lang` — language hint forwarded to `strip_diacritics` and
///   `transliterate_fn`.
/// * `dict` — context dictionary used for vowel restoration.
/// * `transliterate_fn` — context-free transliteration of one piece of text.
pub fn transliterate_context(
    text: &str,
    lang: Option<&str>,
    dict: &ContextDict,
    transliterate_fn: impl Fn(&str, Option<&str>) -> String,
) -> String {
    // Punctuation inside the Arabic block (so part of a word token) that must
    // not take part in dictionary lookup: comma, semicolon, question mark,
    // full stop.
    fn is_word_attached_punctuation(c: char) -> bool {
        matches!(c, '\u{060C}' | '\u{061B}' | '\u{061F}' | '\u{06D4}')
    }

    let tokens = tokenize(text);
    let mut result = String::with_capacity(text.len());
    let mut prev_skeleton: Option<String> = None;

    for token in &tokens {
        if !token.is_word {
            // Non-word (whitespace, punctuation) — pass through.
            result.push_str(&token.text);
            // #101: only a hard boundary clears the previous-word context; a
            // plain inter-word space must not, or bigrams would never fire.
            if is_context_boundary(&token.text) {
                prev_skeleton = None;
            }
            continue;
        }

        // Walk the word token as alternating runs of letters and attached
        // punctuation. A token without such punctuation is a single run, which
        // is handled exactly as the whole token.
        let mut rest: &str = &token.text;
        while let Some(first) = rest.chars().next() {
            let run_is_punct = is_word_attached_punctuation(first);
            let run_end = rest
                .char_indices()
                .find(|&(_, c)| is_word_attached_punctuation(c) != run_is_punct)
                .map_or(rest.len(), |(idx, _)| idx);
            let (segment, tail) = rest.split_at(run_end);
            rest = tail;

            if run_is_punct {
                // Same engine call the mark received as part of the word, so
                // its rendering is unchanged; it just no longer pollutes the
                // skeleton, and sentence-final marks now reset the context.
                result.push_str(&transliterate_fn(segment, lang));
                if is_context_boundary(segment) {
                    prev_skeleton = None;
                }
                continue;
            }

            let skeleton = strip_diacritics(segment, lang);

            // Dictionary resolution (bigram → unigram); on a miss, fall back
            // to context-free transliteration of the original text.
            let rendered = match dict.resolve(prev_skeleton.as_deref(), &skeleton) {
                Some(diacritized) => transliterate_fn(diacritized, lang),
                None => transliterate_fn(segment, lang),
            };
            result.push_str(&rendered);

            prev_skeleton = Some(skeleton);
        }
    }

    result
}

// ---------------------------------------------------------------------------
// Global dictionary singletons (loaded lazily)
// ---------------------------------------------------------------------------

/// Outcome of a dictionary load attempt. (#107)
///
/// Distinguished so that `_transliterate_context` can surface a "corrupt"
/// error message that differs from the "not found / run bootstrap_dicts.sh"
/// message — a corrupt dict requires a different remediation than an absent one.
///
/// The `get_*_dict` accessors in this module map the three states to
/// `Ok(Some(dict))`, `Ok(None)` and `Err(message)` respectively.
pub enum DictState {
    /// Dictionary loaded successfully.
    Ok(ContextDict),
    /// No dictionary file was found in any search path.
    Absent,
    /// A file was found but could not be parsed; includes the error message.
    Corrupt(String),
}

// One lazily-initialised load result per language. Each cell is filled by the
// first call to the matching `get_*_dict` accessor and keeps whatever state
// that first load produced.
static ARABIC_DICT: OnceLock<DictState> = OnceLock::new();
static PERSIAN_DICT: OnceLock<DictState> = OnceLock::new();
static HEBREW_DICT: OnceLock<DictState> = OnceLock::new();

// With embed-dicts, dictionaries are compiled into the binary.
// Without it, they're loaded from the filesystem at runtime.
// The embedded paths are resolved relative to this source file at compile time.
#[cfg(feature = "embed-dicts")]
static ARABIC_DATA: &[u8] = include_bytes!("../data/arabic_dict.bin");
#[cfg(feature = "embed-dicts")]
static PERSIAN_DATA: &[u8] = include_bytes!("../data/persian_dict.bin");
#[cfg(feature = "embed-dicts")]
static HEBREW_DATA: &[u8] = include_bytes!("../data/hebrew_dict.bin");

/// Turn an embedded dictionary table into a [`DictState`].
///
/// Parsing borrows the `'static` bytes directly (#238). A parse failure is
/// reported as `DictState::Corrupt` carrying the parser's message (#107), and
/// a warning naming the dictionary is emitted through `emit_warning_stderr`
/// (#106) so that host applications can capture the diagnostic.
#[cfg(feature = "embed-dicts")]
fn load_embedded_dict(name: &str, data: &'static [u8]) -> DictState {
    let e = match ContextDict::from_static(data) {
        Ok(dict) => return DictState::Ok(dict),
        Err(e) => e,
    };
    crate::emit_warning_stderr(&format!(
        "disarm: failed to load embedded {name} dict: {e}"
    ));
    DictState::Corrupt(e)
}

/// Filesystem locations to probe for the `name` dictionary, highest priority
/// first.
///
/// Security (#61): no candidate is ever relative to the current working
/// directory. If an attacker could influence the CWD, or drop a `./data/`
/// directory into it, a CWD-relative lookup would let them substitute their
/// own dictionary and silently alter transliteration output. The candidates
/// are therefore:
///
/// 1. `$DISARM_DICT_DIR/{name}_dict.bin` — the explicit opt-in for installed
///    wheels (build the dictionaries with `scripts/bootstrap_dicts.sh` and
///    point the variable at the output directory). The value must be an
///    absolute path: **a relative `DISARM_DICT_DIR` is ignored with a
///    warning**, because honouring it would bring back the CWD-relative
///    loading that #61 removed.
/// 2. `$CARGO_MANIFEST_DIR/data/{name}_dict.bin` — for source/development
///    builds; the directory is an absolute path fixed at compile time.
#[cfg(not(feature = "embed-dicts"))]
fn dict_search_paths(name: &str) -> Vec<std::path::PathBuf> {
    use std::path::PathBuf;

    let mut candidates: Vec<PathBuf> = Vec::new();

    match std::env::var_os("DISARM_DICT_DIR").map(PathBuf::from) {
        Some(base) if base.is_absolute() => {
            candidates.push(base.join(format!("{name}_dict.bin")));
        }
        Some(base) => {
            // #106: reported through the shared helper so that Python
            // applications can capture it via warnings/logging instead of it
            // going straight to stderr.
            crate::emit_warning_stderr(&format!(
                "disarm: ignoring relative DISARM_DICT_DIR={:?}; an absolute path is \
                 required (security #61: no CWD-relative dictionary loading).",
                base.display()
            ));
        }
        None => {}
    }

    let dev_path = format!("{}/data/{name}_dict.bin", env!("CARGO_MANIFEST_DIR"));
    candidates.push(PathBuf::from(dev_path));

    candidates
}

/// Load the `name` dictionary from the first [`dict_search_paths`] candidate
/// that can be read.
///
/// The first readable file decides the outcome: it either parses
/// (`DictState::Ok`) or is reported as `DictState::Corrupt` together with a
/// warning through `emit_warning_stderr` (#106) — later candidates are not
/// tried. Candidates that cannot be read are skipped; if none can be read the
/// result is `DictState::Absent` (#107).
#[cfg(not(feature = "embed-dicts"))]
fn load_dict_from_fs(name: &str) -> DictState {
    for path in dict_search_paths(name) {
        let Ok(data) = std::fs::read(&path) else {
            continue;
        };
        // The buffer just read becomes the backing store — no extra copy (#238).
        return match ContextDict::from_owned(data) {
            Ok(dict) => DictState::Ok(dict),
            Err(e) => {
                // Present but malformed: a different condition from "not
                // found", with a different remediation.
                crate::emit_warning_stderr(&format!(
                    "disarm: failed to load {name} dict from {}: {e}",
                    path.display()
                ));
                DictState::Corrupt(format!(
                    "{name} dictionary at {} is corrupt: {e}",
                    path.display()
                ))
            }
        };
    }
    DictState::Absent
}

/// Try to load the Arabic context dictionary.
///
/// Returns:
/// - `Ok(Some(dict))` — loaded successfully
/// - `Ok(None)` — no dictionary file found (run `bootstrap_dicts.sh`)
/// - `Err(msg)` — file found but corrupt (#107)
pub fn get_arabic_dict() -> Result<Option<&'static ContextDict>, &'static str> {
    match ARABIC_DICT.get_or_init(|| {
        #[cfg(feature = "embed-dicts")]
        {
            load_embedded_dict("arabic", ARABIC_DATA)
        }
        #[cfg(not(feature = "embed-dicts"))]
        {
            load_dict_from_fs("arabic")
        }
    }) {
        DictState::Ok(d) => Ok(Some(d)),
        DictState::Absent => Ok(None),
        DictState::Corrupt(msg) => Err(msg.as_str()),
    }
}

/// Fetch the Persian context dictionary, initializing `PERSIAN_DICT` on first use.
///
/// With the `embed-dicts` feature the dictionary is built from `PERSIAN_DATA`;
/// without it, the dictionary is looked up on the filesystem.
///
/// Returns:
/// - `Ok(Some(dict))` — loaded successfully
/// - `Ok(None)` — no dictionary file found (run `bootstrap_dicts.sh`)
/// - `Err(msg)` — file found but corrupt (#107)
pub fn get_persian_dict() -> Result<Option<&'static ContextDict>, &'static str> {
    // Exactly one of these initializers is compiled in, depending on the feature.
    #[cfg(feature = "embed-dicts")]
    let init = || load_embedded_dict("persian", PERSIAN_DATA);
    #[cfg(not(feature = "embed-dicts"))]
    let init = || load_dict_from_fs("persian");

    match PERSIAN_DICT.get_or_init(init) {
        DictState::Absent => Ok(None),
        DictState::Ok(dict) => Ok(Some(dict)),
        DictState::Corrupt(reason) => Err(reason.as_str()),
    }
}

/// Fetch the Hebrew context dictionary, initializing `HEBREW_DICT` on first use.
///
/// With the `embed-dicts` feature the dictionary is built from `HEBREW_DATA`;
/// without it, the dictionary is looked up on the filesystem.
///
/// Returns:
/// - `Ok(Some(dict))` — loaded successfully
/// - `Ok(None)` — no dictionary file found (run `bootstrap_dicts.sh`)
/// - `Err(msg)` — file found but corrupt (#107)
pub fn get_hebrew_dict() -> Result<Option<&'static ContextDict>, &'static str> {
    // Exactly one of these initializers is compiled in, depending on the feature.
    #[cfg(feature = "embed-dicts")]
    let init = || load_embedded_dict("hebrew", HEBREW_DATA);
    #[cfg(not(feature = "embed-dicts"))]
    let init = || load_dict_from_fs("hebrew");

    match HEBREW_DICT.get_or_init(init) {
        DictState::Absent => Ok(None),
        DictState::Ok(dict) => Ok(Some(dict)),
        DictState::Corrupt(reason) => Err(reason.as_str()),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_strip_arabic_diacritics() {
        // Each case pairs a vocalized input with the bare form expected back.
        let cases = [
            // kataba → ktb
            ("كَتَبَ", "كتب"),
            // darrasa (carries a shadda) → drs
            ("دَرَّسَ", "درس"),
        ];
        for (vocalized, bare) in cases {
            assert_eq!(strip_arabic_diacritics(vocalized), bare);
        }
    }

    #[test]
    fn test_strip_hebrew_niqqud() {
        // "shalom" written with niqqud must come back as bare letters.
        let pointed = "שָׁלוֹם";
        let stripped = strip_hebrew_niqqud(pointed);
        assert_eq!(stripped, "שלום");
    }

    #[test]
    fn test_strip_tatweel() {
        // The elongation marks between the letters must be dropped as well.
        let elongated = "كـتـاب";
        let stripped = strip_arabic_diacritics(elongated);
        assert_eq!(stripped, "كتاب");
    }

    #[test]
    fn test_tokenize_arabic() {
        let tokens = tokenize("كتب العربية");
        // Expected shape: word, space, word.
        assert_eq!(tokens.len(), 3);
        let word_flags: Vec<bool> = tokens.iter().map(|token| token.is_word).collect();
        assert_eq!(word_flags, [true, false, true]);
    }

    #[test]
    fn test_tokenize_mixed() {
        // A Latin run, an Arabic word, then another Latin run. Only a lower
        // bound on the token count is checked here, not the exact split.
        let token_count = tokenize("hello كتب world").len();
        assert!(token_count >= 3);
    }

    /// Encode unigrams (`skeleton -> [(form, freq)]`) and bigrams
    /// (`(prev, curr, form)`) in the on-disk binary layout, so tests go through
    /// the real `from_bytes` zero-copy build path (#238) rather than building
    /// the private index by hand.
    ///
    /// Layout produced: `MAGIC`, then five little-endian `u32` header words
    /// (version, unigram count, bigram count, unigram offset, bigram offset),
    /// then the unigram section, then the bigram section. Every string is
    /// written as a little-endian `u16` byte length followed by its bytes.
    fn build_dict_bytes(
        unigrams: &[(&str, &[(&str, u32)])],
        bigrams: &[(&str, &str, &str)],
    ) -> Vec<u8> {
        /// Append `s` as a little-endian `u16` byte length followed by its bytes.
        fn push_str_field(buf: &mut Vec<u8>, s: &str) {
            buf.extend_from_slice(&(s.len() as u16).to_le_bytes());
            buf.extend_from_slice(s.as_bytes());
        }

        // The unigram section starts right after the header, at this offset.
        const HEADER_LEN: u32 = 24;

        let mut unigram_section = Vec::new();
        for &(skeleton, forms) in unigrams {
            push_str_field(&mut unigram_section, skeleton);
            unigram_section.extend_from_slice(&(forms.len() as u16).to_le_bytes());
            for &(form, freq) in forms {
                push_str_field(&mut unigram_section, form);
                unigram_section.extend_from_slice(&freq.to_le_bytes());
            }
        }

        let mut bigram_section = Vec::new();
        for &(prev, curr, form) in bigrams {
            for field in [prev, curr, form] {
                push_str_field(&mut bigram_section, field);
            }
        }

        let header_words: [u32; 5] = [
            1, // version
            unigrams.len() as u32,
            bigrams.len() as u32,
            HEADER_LEN, // unigram offset
            HEADER_LEN + unigram_section.len() as u32, // bigram offset
        ];

        let mut data = Vec::new();
        data.extend_from_slice(MAGIC);
        for word in header_words {
            data.extend_from_slice(&word.to_le_bytes());
        }
        data.extend_from_slice(&unigram_section);
        data.extend_from_slice(&bigram_section);
        data
    }

    #[test]
    fn test_context_dict_resolve() {
        let skeleton = "كتب";
        let article = "ال";
        // kataba: listed first, with the higher frequency.
        let kataba = "كَتَبَ";
        // kutub: the form the bigram (article, skeleton) maps to.
        let kutub = "كُتُب";

        let bytes = build_dict_bytes(
            &[(skeleton, &[(kataba, 100), (kutub, 80)])],
            &[(article, skeleton, kutub)],
        );
        let dict = ContextDict::from_bytes(&bytes).expect("valid dict should parse");

        // No previous word: the most frequent unigram form wins.
        assert_eq!(dict.resolve(None, skeleton), Some(kataba));

        // Preceded by the article: the bigram entry takes over.
        assert_eq!(dict.resolve(Some(article), skeleton), Some(kutub));

        // A skeleton that is not in the dictionary resolves to nothing.
        assert_eq!(dict.resolve(None, "xyz"), None);
    }

    #[test]
    fn test_bigram_fires_across_space() {
        // #101: ordinary space-separated prose must still trigger bigram
        // disambiguation, i.e. a plain inter-word space does NOT clear the
        // previous-word context.
        let unigram_form = "كَتَبَ"; // kataba, the default
        let bigram_form = "كُتُب"; // kutub, chosen after "ال"
        let bytes = build_dict_bytes(
            &[("كتب", &[(unigram_form, 100)])],
            &[("ال", "كتب", bigram_form)],
        );
        let dict = ContextDict::from_bytes(&bytes).expect("valid dict should parse");

        // Words separated by a space: the bigram form must appear and the
        // unigram form must not.
        let out = transliterate_context("ال كتب", None, &dict, |s, _| s.to_string());
        assert!(out.contains(bigram_form), "space must preserve bigram context: {out}");
        assert!(!out.contains(unigram_form), "must not fall back to the unigram: {out}");

        // Words separated by a newline (a hard boundary): context is reset, so
        // the unigram form is expected instead.
        let out2 = transliterate_context("ال\nكتب", None, &dict, |s, _| s.to_string());
        assert!(out2.contains(unigram_form), "newline must reset to the unigram: {out2}");
    }

    #[test]
    fn test_resolve_many_entries_binary_search() {
        // #238: entries are supplied in NON-sorted order, over several skeletons
        // and several prev-groups, so the build has to sort them and `resolve`
        // has to binary-search correctly rather than hit a single-entry index.

        // Skeletons deliberately out of sorted order.
        let unigrams: &[(&str, &[(&str, u32)])] = &[
            ("dog", &[("DOG", 9)]),
            ("ant", &[("ANT", 7)]),
            ("cat", &[("CAT-best", 5), ("CAT-alt", 4)]),
            ("bee", &[("BEE", 3)]),
        ];
        // Two prev-groups ("the", "a"), shuffled within and among each other.
        let bigrams: &[(&str, &str, &str)] = &[
            ("the", "dog", "the-DOG"),
            ("a", "cat", "a-CAT"),
            ("the", "ant", "the-ANT"),
            ("the", "cat", "the-CAT"),
        ];
        let bytes = build_dict_bytes(unigrams, bigrams);
        let dict = ContextDict::from_bytes(&bytes).expect("valid dict should parse");

        // Unigram tier: each skeleton yields its first-listed form; an unknown
        // skeleton yields nothing.
        let unigram_lookups = [
            ("ant", Some("ANT")),
            ("bee", Some("BEE")),
            ("cat", Some("CAT-best")),
            ("dog", Some("DOG")),
            ("zzz", None),
        ];
        for (skeleton, expected) in unigram_lookups {
            assert_eq!(dict.resolve(None, skeleton), expected);
        }

        // Bigram tier: prev → curr lookups across both groups, followed by the
        // two fall-through cases.
        let bigram_lookups = [
            ("the", "dog", "the-DOG"),
            ("the", "ant", "the-ANT"),
            ("the", "cat", "the-CAT"),
            ("a", "cat", "a-CAT"),
            // Known prev but no entry for this curr → unigram form.
            ("the", "bee", "BEE"),
            // Unknown prev → unigram tier.
            ("nope", "cat", "CAT-best"),
        ];
        for (prev, skeleton, expected) in bigram_lookups {
            assert_eq!(dict.resolve(Some(prev), skeleton), Some(expected));
        }

        // Four unigrams and four bigram entries went in.
        assert_eq!(dict.stats(), (4, 4));
    }

    /// Smallest well-formed dictionary buffer used by the parser tests: a single
    /// unigram ("ab" → [("AB", 5)]) plus a single bigram (("ab", "cd") → "X").
    fn build_valid_dict() -> Vec<u8> {
        let unigrams: &[(&str, &[(&str, u32)])] = &[("ab", &[("AB", 5)])];
        let bigrams: &[(&str, &str, &str)] = &[("ab", "cd", "X")];
        build_dict_bytes(unigrams, bigrams)
    }

    #[test]
    fn test_from_bytes_valid_roundtrip() {
        // Parse the minimal buffer, then read back both entries it contains.
        let bytes = build_valid_dict();
        let dict = ContextDict::from_bytes(&bytes).expect("valid dict should parse");

        let unigram_hit = dict.resolve(None, "ab");
        let bigram_hit = dict.resolve(Some("ab"), "cd");
        assert_eq!(unigram_hit, Some("AB"));
        assert_eq!(bigram_hit, Some("X"));
    }

    #[test]
    fn test_from_bytes_rejects_small_and_bad_magic() {
        // Undersized inputs: an empty buffer and ten zero bytes.
        let ten_zero_bytes = [0u8; 10];
        let undersized: [&[u8]; 2] = [&[], &ten_zero_bytes];
        for buf in undersized {
            assert!(ContextDict::from_bytes(buf).is_err());
        }

        // An otherwise valid buffer whose first magic byte has been overwritten.
        let mut bad = build_valid_dict();
        bad[0] = b'X';
        assert!(ContextDict::from_bytes(&bad).is_err());
    }

    #[test]
    fn test_from_bytes_truncation_never_panics() {
        // Regression: the parser used to index data[pos..pos+N] directly. Feed it
        // every strict prefix of a valid buffer; the result is deliberately
        // discarded, because the only requirement checked is "no panic".
        let full = build_valid_dict();
        (0..full.len()).for_each(|n| {
            let _ = ContextDict::from_bytes(&full[..n]);
        });

        // The untruncated buffer must still be accepted.
        assert!(ContextDict::from_bytes(&full).is_ok());
    }

    #[test]
    fn test_from_bytes_bogus_counts_do_not_panic() {
        // Header only, claiming an absurd number of unigrams with no data behind
        // it. The parser must answer with Err; the concern is a panic or an
        // out-of-memory from sizing an allocation off the declared count.
        let header_words: [u32; 5] = [
            1, // version
            u32::MAX, // unigram_count = 4 billion
            0, // bigram_count
            24, // unigram_offset
            24, // bigram_offset
        ];

        let mut data = Vec::new();
        data.extend_from_slice(MAGIC);
        for word in header_words {
            data.extend_from_slice(&word.to_le_bytes());
        }

        assert!(ContextDict::from_bytes(&data).is_err());
    }

    #[test]
    fn test_from_bytes_offset_out_of_range() {
        let mut data = build_valid_dict();
        // Rewrite unigram_offset (header bytes 16..20) to land 100 bytes past
        // the end of the buffer.
        let past_end = data.len() as u32 + 100;
        data[16..20].copy_from_slice(&past_end.to_le_bytes());
        assert!(ContextDict::from_bytes(&data).is_err());
    }

    #[test]
    fn test_from_bytes_offset_inside_header_rejected() {
        let mut data = build_valid_dict();
        // Rewrite unigram_offset (header bytes 16..20) to 8, which falls inside
        // the 24-byte header itself.
        let inside_header = 8u32;
        data[16..20].copy_from_slice(&inside_header.to_le_bytes());
        assert!(ContextDict::from_bytes(&data).is_err());
    }

    #[cfg(not(feature = "embed-dicts"))]
    #[test]
    fn test_dict_search_paths_never_cwd_relative() {
        // #61: a dictionary must never be picked up from a CWD-relative path,
        // since whoever controls the working directory could plant a file there.
        let paths = dict_search_paths("arabic");

        // The last candidate is the always-present dev fallback
        // (CARGO_MANIFEST_DIR); it has to be absolute.
        let manifest = paths.last().expect("at least the manifest-dir candidate");
        assert!(manifest.is_absolute(), "dev dict path must be absolute, got {manifest:?}");

        // The bare CWD-relative form must not show up among the candidates.
        let cwd_relative = std::path::Path::new("data/arabic_dict.bin");
        assert!(
            paths.iter().all(|p| p != cwd_relative),
            "must not probe the CWD-relative data/ path; got {paths:?}"
        );

        // Stronger invariant: *every* candidate is absolute. A relative
        // DISARM_DICT_DIR is rejected at the source, so no env value can
        // introduce a CWD-relative candidate.
        assert!(
            paths.iter().all(|p| p.is_absolute()),
            "all dict search paths must be absolute; got {paths:?}"
        );
    }
}