Skip to main content

iword/
index.rs

1use crate::hash::Hash;
2use crate::{key, Match, Mode};
3use crate::ClassifyResult;
4use std::collections::HashMap;
5use std::fs;
6
7#[cfg(feature = "regex")]
8use regex::RegexSet;
9
10#[cfg(feature = "save")]
11use serde::{Serialize, Deserialize};
12
/// Sizing divisor for the hash index: the block count is the number of stored
/// hashes divided by this value, rounded up.
const BSIZE: usize = 6;
14
15// ── index internals ───────────────────────────────────────────────────────────
16
/// Compiled keyword dictionary. Built via [`Dictionary::builder()`] or [`Dictionary::from_file()`].
///
/// Scans text in **O(N)** time (N = text bytes), independent of dictionary size.
///
/// Words are stored as hash entries grouped into blocks; a lookup picks one
/// block from the hash and binary-searches inside it.
pub struct Dictionary {
    /// Number of blocks. A hash selects its block with `hash.a % bcount`;
    /// zero means the dictionary holds no words.
    bcount: usize,
    /// Per-block slice of `entries`: (start_in_entries, count)
    blocks: Vec<(usize, usize)>,
    /// (b_masked, key)  key=255 is the internal sentinel (not found)
    entries: Vec<(u64, u8)>,
    /// Bitmask of present keys. Bit i set ↔ some word has key==i. Covers keys 0–127.
    mask: u128,
    /// Per-word weight keyed by hash.a. Default 1.0.
    word_weights: HashMap<u64, f32>,
    /// Per-key weight, indexed by key. Default 1.0.
    key_weights: [f32; 256],
    /// Longest word in bytes (never less than 16); caps the slot-active window during scan.
    max_word_len: usize,
    /// Regex patterns: (compiled RegexSet, per-pattern (key, weight)).
    /// `None` when no pattern was supplied or the set failed to compile.
    #[cfg(feature = "regex")]
    regex_patterns: Option<(RegexSet, Vec<(u8, f32)>)>,
}
38
39// ── DictionaryBuilder ─────────────────────────────────────────────────────────
40
/// Flexible builder — add words programmatically or load from files/strings.
///
/// Every mutator consumes and returns the builder so calls can be chained;
/// finish with [`DictionaryBuilder::build`].
pub struct DictionaryBuilder {
    /// Pending literal words: (word, key, weight).
    words: Vec<(String, u8, f32)>,
    /// Per-key weight, indexed by key. Every slot starts at 1.0.
    key_weights: [f32; 256],
    /// Regex patterns: (pattern string, key, weight). Only populated with `regex` feature.
    #[cfg(feature = "regex")]
    regex_words: Vec<(String, u8, f32)>,
}
49
50impl DictionaryBuilder {
51    pub fn new() -> Self {
52        DictionaryBuilder {
53            words: Vec::new(),
54            key_weights: [1.0; 256],
55            #[cfg(feature = "regex")]
56            regex_words: Vec::new(),
57        }
58    }
59
60    /// Add a single word with the given category key (0–254). Weight defaults to 1.0.
61    pub fn add(mut self, word: &str, key: u8) -> Self {
62        if !word.is_empty() { self.words.push((word.to_string(), key, 1.0)); }
63        self
64    }
65
66    /// Add a single word with an explicit per-word weight.
67    pub fn add_weighted(mut self, word: &str, key: u8, weight: f32) -> Self {
68        if !word.is_empty() { self.words.push((word.to_string(), key, weight)); }
69        self
70    }
71
72    /// Add multiple words with the same key. Weight defaults to 1.0.
73    pub fn add_many(mut self, words: &[&str], key: u8) -> Self {
74        for &w in words {
75            if !w.is_empty() { self.words.push((w.to_string(), key, 1.0)); }
76        }
77        self
78    }
79
80    /// Set a per-key weight applied to all words with that key (multiplied with per-word weight).
81    pub fn set_key_weight(mut self, key: u8, weight: f32) -> Self {
82        self.key_weights[key as usize] = weight;
83        self
84    }
85
86    /// Parse a tab-separated word list (compatible with original iWord format).
87    ///
88    /// Format: `word`, `word\tkey`, or `word\tkey\tweight` per line.
89    /// Lines starting with `#` ignored. Weight defaults to 1.0.
90    pub fn load_str(mut self, data: &str) -> Self {
91        for line in data.lines() {
92            let line = line.trim();
93            if line.is_empty() || line.starts_with('#') { continue; }
94            let mut parts = line.splitn(3, '\t');
95            let word = parts.next().unwrap().trim();
96            if word.is_empty() { continue; }
97            // Accept decimal (e.g. "42") or single hex digit (e.g. "a") for
98            // backwards-compatibility with the original iWord tab format.
99            let k: u8 = parts.next()
100                .map(|s| s.trim())
101                .and_then(|s| {
102                    s.parse::<u8>().ok()
103                        .or_else(|| u8::from_str_radix(s, 16).ok())
104                })
105                .unwrap_or(key::DEFAULT);
106            let w: f32 = parts.next()
107                .and_then(|s| s.trim().parse().ok())
108                .unwrap_or(1.0);
109            // Lines of the form /pattern/ are regex patterns (requires `regex` feature).
110            #[cfg(feature = "regex")]
111            if word.starts_with('/') && word.ends_with('/') && word.len() > 2 {
112                let pattern = &word[1..word.len() - 1];
113                self.regex_words.push((pattern.to_string(), k, w));
114                continue;
115            }
116            self.words.push((word.to_string(), k, w));
117        }
118        self
119    }
120
121    /// Load a word list from a file path.
122    pub fn load_file(self, path: &str) -> Result<Self, String> {
123        let data = fs::read_to_string(path)
124            .map_err(|e| format!("cannot read {path}: {e}"))?;
125        Ok(self.load_str(&data))
126    }
127
128    /// Merge another builder's words into this one.
129    pub fn merge(mut self, other: DictionaryBuilder) -> Self {
130        self.words.extend(other.words);
131        #[cfg(feature = "regex")]
132        self.regex_words.extend(other.regex_words);
133        self
134    }
135
136    /// Build the dictionary index.
137    pub fn build(self) -> Dictionary {
138        #[cfg(feature = "regex")]
139        {
140            Dictionary::build_from(self.words, self.key_weights, self.regex_words)
141        }
142        #[cfg(not(feature = "regex"))]
143        {
144            Dictionary::build_from(self.words, self.key_weights)
145        }
146    }
147}
148
149impl Default for DictionaryBuilder {
150    fn default() -> Self { Self::new() }
151}
152
153// ── Dictionary impl ───────────────────────────────────────────────────────────
154
155impl Dictionary {
156    /// Create a new [`DictionaryBuilder`].
157    pub fn builder() -> DictionaryBuilder { DictionaryBuilder::new() }
158
159    /// Load directly from a file path (convenience shorthand).
160    pub fn from_file(path: &str) -> Result<Self, String> {
161        DictionaryBuilder::new().load_file(path).map(|b| b.build())
162    }
163
164    /// Parse directly from a string (convenience shorthand).
165    pub fn from_text(data: &str) -> Self {
166        DictionaryBuilder::new().load_str(data).build()
167    }
168
169    /// Return category key (0–254) for `word`, or `None` if not found.
170    pub fn seek(&self, word: &str) -> Option<u8> {
171        let h = Hash::from_bytes(word.as_bytes());
172        match self.lookup(&h) {
173            255 => None,
174            k   => Some(k),
175        }
176    }
177
178    /// Return bitmask of category keys present in the dictionary.
179    /// Bit `i` is set if at least one word with `key == i` exists. Covers keys 0–127.
180    pub fn mask(&self) -> u128 { self.mask }
181
182    /// Scan `text` and return all keyword matches.
183    ///
184    /// Pass `Mode::FORBID` to include words with key < 5 (BLOCK/ALERT/FLAG/THROTTLE/LOG).
185    /// Pass `Mode::IGNORE_CASE` for case-insensitive matching (dictionary must be lowercase).
186    /// Combine flags with `|`: `Mode::HTML | Mode::FORBID | Mode::IGNORE_CASE`.
187    pub fn scan(&self, text: &str, mode: Mode) -> Vec<Match> {
188        let scan_text = if mode.contains(Mode::IGNORE_CASE) {
189            std::borrow::Cow::Owned(text.to_lowercase())
190        } else {
191            std::borrow::Cow::Borrowed(text)
192        };
193        let mut matches = self.scan_bytes(scan_text.as_bytes(), mode);
194        #[cfg(feature = "regex")]
195        self.scan_regex(&scan_text, mode, &mut matches);
196        matches.sort_unstable_by_key(|m| m.position);
197        matches
198    }
199
200    /// Replace all matched words in `text` with `'*'`.
201    ///
202    /// With `Mode::IGNORE_CASE`, matches are case-insensitive but `*` masks the original casing.
203    pub fn filter(&self, text: &str, mode: Mode) -> String {
204        let matches = self.scan(text, mode);
205        if matches.is_empty() { return text.to_string(); }
206        let mut buf = text.as_bytes().to_vec();
207        for m in &matches {
208            buf[m.position..m.position + m.length].fill(b'*');
209        }
210        String::from_utf8_lossy(&buf).into_owned()
211    }
212
213    /// Return the first keyword match in `text`, or `None` if no match.
214    ///
215    /// Faster than `scan()` when you only need to know if a match exists.
216    pub fn scan_first(&self, text: &str, mode: Mode) -> Option<Match> {
217        if mode.contains(Mode::IGNORE_CASE) {
218            self.scan_bytes_first(text.to_lowercase().as_bytes(), mode)
219        } else {
220            self.scan_bytes_first(text.as_bytes(), mode)
221        }
222    }
223
224    /// Return `true` if `text` contains at least one keyword match.
225    pub fn contains(&self, text: &str, mode: Mode) -> bool {
226        self.scan_first(text, mode).is_some()
227    }
228
229    /// Return the highest-severity (lowest key value) match in `text`, or `None`.
230    pub fn severity(&self, text: &str, mode: Mode) -> Option<Match> {
231        self.scan(text, mode).into_iter().min_by_key(|m| m.key)
232    }
233
234    /// Return the total weighted score for each key present in `text`.
235    ///
236    /// Score = per-word weight × per-key weight (both default 1.0).
237    pub fn score(&self, text: &str, mode: Mode) -> HashMap<u8, f32> {
238        self.score_inner(text, mode, None)
239    }
240
241    /// Like [`score`], but applies additional runtime key weights on top of dictionary weights.
242    ///
243    /// `runtime_weights` is a slice of `(key, multiplier)` pairs.
244    /// The multiplier stacks on top of the dictionary-defined per-key weight.
245    pub fn score_with_weights(&self, text: &str, mode: Mode, runtime_weights: &[(u8, f32)]) -> HashMap<u8, f32> {
246        self.score_inner(text, mode, Some(runtime_weights))
247    }
248
249    /// Classify `text` — returns the key with the highest weighted score, or `None`.
250    pub fn classify(&self, text: &str, mode: Mode) -> Option<ClassifyResult> {
251        self.classify_from(self.score(text, mode))
252    }
253
254    /// Like [`classify`], but applies additional runtime key weights.
255    ///
256    /// Useful when the same dictionary is reused in different contexts
257    /// with different priority tuning (e.g. stricter BLOCK weight at night).
258    pub fn classify_with_weights(&self, text: &str, mode: Mode, runtime_weights: &[(u8, f32)]) -> Option<ClassifyResult> {
259        self.classify_from(self.score_with_weights(text, mode, runtime_weights))
260    }
261
262    fn score_inner(&self, text: &str, mode: Mode, runtime_weights: Option<&[(u8, f32)]>) -> HashMap<u8, f32> {
263        let mut scores: HashMap<u8, f32> = HashMap::new();
264        for m in self.scan(text, mode) {
265            let h = Hash::from_bytes(text[m.position..m.position + m.length].as_bytes());
266            let word_w    = self.word_weights.get(&h.a).copied().unwrap_or(1.0);
267            let dict_kw   = self.key_weights[m.key as usize];
268            let runtime_kw = runtime_weights
269                .and_then(|rw| rw.iter().find(|(k, _)| *k == m.key))
270                .map(|(_, w)| *w)
271                .unwrap_or(1.0);
272            *scores.entry(m.key).or_insert(0.0) += word_w * dict_kw * runtime_kw;
273        }
274        scores
275    }
276
277    fn classify_from(&self, scores: HashMap<u8, f32>) -> Option<ClassifyResult> {
278        scores.into_iter()
279            .max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
280            .map(|(k, score)| ClassifyResult { key: k, score })
281    }
282
283    /// Extract only matches with a specific category key.
284    pub fn scan_key(&self, text: &str, key: u8, mode: Mode) -> Vec<Match> {
285        self.scan(text, mode).into_iter().filter(|m| m.key == key).collect()
286    }
287
288    // ── regex scan ───────────────────────────────────────────────────────────
289
290    #[cfg(feature = "regex")]
291    fn scan_regex(&self, text: &str, mode: Mode, matches: &mut Vec<Match>) {
292        let Some((set, meta)) = &self.regex_patterns else { return };
293        let forbid = mode.contains(Mode::FORBID);
294        for idx in set.matches(text) {
295            let (k, _) = meta[idx];
296            if !forbid && (k as u32) < key::FORBID_THRESHOLD as u32 { continue; }
297            if let Ok(re) = regex::Regex::new(set.patterns()[idx].as_str()) {
298                for m in re.find_iter(text) {
299                    matches.push(Match { position: m.start(), length: m.len(), key: k });
300                }
301            }
302        }
303    }
304
305    // ── save / load ──────────────────────────────────────────────────────────
306
307    #[cfg(feature = "save")]
308    pub fn save(&self) -> Result<Vec<u8>, String> {
309        let snapshot = DictSnapshot::from_dict(self);
310        postcard::to_allocvec(&snapshot).map_err(|e| e.to_string())
311    }
312
313    #[cfg(feature = "save")]
314    pub fn save_to_file(&self, path: &str) -> Result<(), String> {
315        let bytes = self.save()?;
316        fs::write(path, bytes).map_err(|e| format!("cannot write {path}: {e}"))
317    }
318
319    #[cfg(feature = "save")]
320    pub fn load(bytes: &[u8]) -> Result<Self, String> {
321        let snapshot: DictSnapshot = postcard::from_bytes(bytes).map_err(|e| e.to_string())?;
322        Ok(snapshot.into_dict())
323    }
324
325    #[cfg(feature = "save")]
326    pub fn load_from_file(path: &str) -> Result<Self, String> {
327        let bytes = fs::read(path).map_err(|e| format!("cannot read {path}: {e}"))?;
328        Self::load(&bytes)
329    }
330
331    // ── internal build ────────────────────────────────────────────────────────
332
333    #[cfg(feature = "regex")]
334    fn build_from(words: Vec<(String, u8, f32)>, key_weights: [f32; 256], regex_words: Vec<(String, u8, f32)>) -> Self {
335        let mut dict = Self::build_hash(words, key_weights);
336        if !regex_words.is_empty() {
337            let patterns: Vec<&str> = regex_words.iter().map(|(p, _, _)| p.as_str()).collect();
338            let meta: Vec<(u8, f32)> = regex_words.iter().map(|(_, k, w)| (*k, *w)).collect();
339            if let Ok(set) = RegexSet::new(&patterns) {
340                dict.regex_patterns = Some((set, meta));
341            }
342        }
343        dict
344    }
345
346    #[cfg(not(feature = "regex"))]
347    fn build_from(words: Vec<(String, u8, f32)>, key_weights: [f32; 256]) -> Self {
348        Self::build_hash(words, key_weights)
349    }
350
    /// Build the hash index from `(word, key, weight)` triples.
    ///
    /// Steps:
    /// 1. Hash every word, remember its weight under `hash.a`, and set the
    ///    key's bit in `mask` (keys 0–127 only).
    /// 2. For words longer than 16 bytes, also store sentinel hashes (key 255)
    ///    taken at 16-byte-aligned cut points below the word length.
    /// 3. Sort by block, then `b_masked`, then key, and drop adjacent
    ///    duplicates with equal `(a, b)` (the lowest key is the one kept).
    /// 4. Lay the entries out contiguously per block and record each block's
    ///    `(start, count)`.
    ///
    /// The regex field is always `None` here.
    fn build_hash(words: Vec<(String, u8, f32)>, key_weights: [f32; 256]) -> Self {
        let mut hashes: Vec<Hash> = Vec::new();
        let mut mask: u128 = 0;
        let mut word_weights: HashMap<u64, f32> = HashMap::new();

        // Never below 16, even for an empty or short-word dictionary.
        let max_word_len = words.iter().map(|(w, _, _)| w.len()).max().unwrap_or(16).max(16);

        for (word, key, weight) in &words {
            let bytes = word.as_bytes();
            let mut tmp = bytes.to_vec();
            let mut j = bytes.len();
            // Walk the 16-aligned cut points below the word length, highest first.
            // NOTE(review): presumably `Hash::from_bytes` stops at the zero byte
            // written at the cut point, so each sentinel hashes a prefix — confirm
            // against the `Hash` implementation.
            while j > 16 {
                j = (j - 1) & !0xf;
                tmp[j] = 0;
                let mut h = Hash::from_bytes(&tmp);
                h.f = 255;
                hashes.push(h);
                // Restore the byte before trying the next cut point.
                tmp[j] = bytes[j];
            }
            let mut h = Hash::from_bytes(bytes);
            h.f = *key;
            // Keyed by `a` alone: a later word with the same `a` replaces the weight.
            word_weights.insert(h.a, *weight);
            hashes.push(h);
            if (*key as u32) < 128 { mask |= 1u128 << key; }
        }

        let msize = hashes.len();
        if msize == 0 {
            // No words: `bcount == 0` makes lookups and scans return early.
            return Dictionary {
                bcount: 0, blocks: vec![], entries: vec![],
                mask: 0, word_weights, key_weights, max_word_len,
                #[cfg(feature = "regex")]
                regex_patterns: None,
            };
        }
        // Block count is fixed from the pre-dedup hash count.
        let bcount = msize.div_ceil(BSIZE);

        hashes.sort_by(|a, b| {
            let ba = (a.a % bcount as u64) as usize;
            let bb = (b.a % bcount as u64) as usize;
            ba.cmp(&bb)
                .then(a.b_masked().cmp(&b.b_masked()))
                .then(a.f.cmp(&b.f))
        });
        // Only adjacent duplicates are removed; the sort puts the lowest key first.
        hashes.dedup_by(|a, b| a.a == b.a && a.b == b.b);

        let mut blocks = vec![(0usize, 0usize); bcount];
        let mut entries: Vec<(u64, u8)> = Vec::with_capacity(hashes.len());
        let mut cur_blk = usize::MAX;

        for h in &hashes {
            let blk = (h.a % bcount as u64) as usize;
            if blk != cur_blk {
                // First entry of a new block: record where its slice begins.
                cur_blk = blk;
                blocks[blk].0 = entries.len();
            }
            blocks[blk].1 += 1;
            entries.push((h.b_masked(), h.f));
        }

        Dictionary {
            bcount, blocks, entries, mask, word_weights, key_weights, max_word_len,
            #[cfg(feature = "regex")]
            regex_patterns: None,
        }
    }
417
418    // ── internal lookup ───────────────────────────────────────────────────────
419
420    fn lookup(&self, h: &Hash) -> u8 {
421        if self.bcount == 0 { return 255; }
422        let blk = (h.a % self.bcount as u64) as usize;
423        let (start, count) = self.blocks[blk];
424        let target = h.b_masked();
425        let slice = &self.entries[start..start + count];
426        match slice.binary_search_by_key(&target, |&(hb, _)| hb) {
427            Ok(i)  => slice[i].1,
428            Err(_) => 255,
429        }
430    }
431
432    // ── scan ─────────────────────────────────────────────────────────────────
433
434    fn scan_bytes_first(&self, s: &[u8], mode: Mode) -> Option<Match> {
435        // Identical window logic to scan_bytes, but returns on the first accepted hit.
436        let size = s.len();
437        if size == 0 || self.bcount == 0 { return None; }
438
439        let html    = mode.contains(Mode::HTML);
440        let forbid  = mode.contains(Mode::FORBID);
441        let english = mode.contains(Mode::ENGLISH);
442
443        let mut w = vec![0u8; size];
444        let mut v = vec![0u8; size];
445        let mut hashes = vec![Hash::default(); 256];
446        let mut active = vec![false; 256];
447
448        let mut i = 0;
449        while i < size {
450            if html && s[i] == b'<' {
451                i = skip_tag(s, i + 1);
452                continue;
453            }
454
455            let slot = i & 0xff;
456            hashes[slot] = Hash::default();
457            active[slot] = true;
458
459            for j in 0..256usize {
460                if !active[j] { continue; }
461                hashes[j].feed(s[i]);
462                let r = self.lookup(&hashes[j]);
463
464                if i.wrapping_sub(j) >= self.max_word_len && r == 255 {
465                    active[j] = false;
466                }
467
468                if r != 255 {
469                    let start = i.wrapping_sub(i.wrapping_sub(j) & 0xff);
470                    if english {
471                        let ok_before = start == 0 || !is_word_char(s[start - 1]);
472                        let ok_after  = i + 1 >= size || !is_word_char(s[i + 1]);
473                        if !ok_before || !ok_after { continue; }
474                    }
475                    w[start] = (i - j + 1) as u8;
476                    v[start] = r;
477                }
478            }
479            i += 1;
480        }
481
482        // Walk results in order; return the first accepted match.
483        let mut i = 0;
484        while i < size {
485            if w[i] != 0 {
486                let k = v[i];
487                if forbid || k >= key::FORBID_THRESHOLD {
488                    return Some(Match { position: i, length: w[i] as usize, key: k });
489                }
490                i += (w[i] - 1) as usize;
491            }
492            i += 1;
493        }
494        None
495    }
496
497    fn scan_bytes(&self, s: &[u8], mode: Mode) -> Vec<Match> {
498        let size = s.len();
499        if size == 0 || self.bcount == 0 { return vec![]; }
500
501        let html    = mode.contains(Mode::HTML);
502        let forbid  = mode.contains(Mode::FORBID);
503        let english = mode.contains(Mode::ENGLISH);
504
505        let mut w = vec![0u8; size];
506        let mut v = vec![0u8; size];
507        let mut hashes = vec![Hash::default(); 256];
508        let mut active = vec![false; 256];
509
510        let mut i = 0;
511        while i < size {
512            if html && s[i] == b'<' {
513                i = skip_tag(s, i + 1);
514                continue;
515            }
516
517            let slot = i & 0xff;
518            hashes[slot] = Hash::default();
519            active[slot] = true;
520
521            for j in 0..256usize {
522                if !active[j] { continue; }
523                hashes[j].feed(s[i]);
524                let r = self.lookup(&hashes[j]);
525
526                if i.wrapping_sub(j) >= self.max_word_len && r == 255 {
527                    active[j] = false;
528                }
529
530                if r != 255 {
531                    let start = i.wrapping_sub(i.wrapping_sub(j) & 0xff);
532                    if english {
533                        let ok_before = start == 0 || !is_word_char(s[start - 1]);
534                        let ok_after  = i + 1 >= size || !is_word_char(s[i + 1]);
535                        if !ok_before || !ok_after { continue; }
536                    }
537                    w[start] = (i - j + 1) as u8;
538                    v[start] = r;
539                }
540            }
541            i += 1;
542        }
543
544        let mut results = Vec::new();
545        let mut i = 0;
546        while i < size {
547            if w[i] != 0 {
548                let k = v[i];
549                if forbid || k >= key::FORBID_THRESHOLD {
550                    results.push(Match { position: i, length: w[i] as usize, key: k });
551                }
552                i += (w[i] - 1) as usize;
553            }
554            i += 1;
555        }
556        results
557    }
558}
559
/// Advance past an HTML tag and return the index just after its closing `>`.
///
/// `i` is the index of the first byte after `<`. A `>` inside a single- or
/// double-quoted attribute value does not close the tag. When the tag is never
/// closed, the returned index lies beyond `s.len()`.
fn skip_tag(s: &[u8], mut i: usize) -> usize {
    loop {
        match s.get(i).copied() {
            // End of input or the closing bracket: resume one past this index.
            None | Some(b'>') => return i + 1,
            Some(quote) if quote == b'"' || quote == b'\'' => {
                // Jump to the matching closing quote (or the end of input).
                i += 1;
                while let Some(&c) = s.get(i) {
                    if c == quote {
                        break;
                    }
                    i += 1;
                }
            }
            Some(_) => {}
        }
        i += 1;
    }
}
571
/// Whether `c` counts as part of a word for boundary checks: an ASCII letter
/// or digit, `_`, or `@`.
fn is_word_char(c: u8) -> bool {
    matches!(c, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'@')
}
575
576// ── DictSnapshot (postcard serialization) ────────────────────────────────────
577
/// Serializable mirror of [`Dictionary`], used for postcard save/load.
///
/// Fields that `Dictionary` holds in non-serializable or fixed-size form are
/// stored here as plain vectors.
#[cfg(feature = "save")]
#[derive(Serialize, Deserialize)]
struct DictSnapshot {
    /// Number of hash blocks.
    bcount:       usize,
    /// Per-block (start_in_entries, count).
    blocks:       Vec<(usize, usize)>,
    /// (b_masked, key) entries.
    entries:      Vec<(u64, u8)>,
    /// Bitmask of present keys (0–127).
    mask:         u128,
    /// Per-word weights as (hash.a, weight) pairs.
    word_weights: Vec<(u64, f32)>,
    /// Per-key weights, indexed by key.
    key_weights:  Vec<f32>,
    /// Longest word in bytes.
    max_word_len: usize,
    /// Regex patterns stored as (pattern_string, key, weight) for re-compilation on load.
    /// Empty when the `regex` feature is off or no pattern is present.
    regex_patterns: Vec<(String, u8, f32)>,
}
591
592#[cfg(feature = "save")]
593impl DictSnapshot {
594    fn from_dict(d: &Dictionary) -> Self {
595        DictSnapshot {
596            bcount:       d.bcount,
597            blocks:       d.blocks.clone(),
598            entries:      d.entries.clone(),
599            mask:         d.mask,
600            word_weights: d.word_weights.iter().map(|(&k, &v)| (k, v)).collect(),
601            key_weights:  d.key_weights.to_vec(),
602            max_word_len: d.max_word_len,
603            regex_patterns: {
604                #[cfg(feature = "regex")]
605                {
606                    d.regex_patterns.as_ref().map(|(set, meta)| {
607                        set.patterns().iter().zip(meta.iter())
608                            .map(|(p, &(k, w))| (p.clone(), k, w))
609                            .collect()
610                    }).unwrap_or_default()
611                }
612                #[cfg(not(feature = "regex"))]
613                { vec![] }
614            },
615        }
616    }
617
618    fn into_dict(self) -> Dictionary {
619        let word_weights: HashMap<u64, f32> = self.word_weights.into_iter().collect();
620        let mut key_weights = [1.0f32; 256];
621        for (i, w) in self.key_weights.iter().enumerate().take(256) {
622            key_weights[i] = *w;
623        }
624        Dictionary {
625            bcount:       self.bcount,
626            blocks:       self.blocks,
627            entries:      self.entries,
628            mask:         self.mask,
629            word_weights,
630            key_weights,
631            max_word_len: self.max_word_len,
632            #[cfg(feature = "regex")]
633            regex_patterns: if self.regex_patterns.is_empty() {
634                None
635            } else {
636                let patterns: Vec<&str> = self.regex_patterns.iter().map(|(p, _, _)| p.as_str()).collect();
637                let meta: Vec<(u8, f32)> = self.regex_patterns.iter().map(|(_, k, w)| (*k, *w)).collect();
638                RegexSet::new(&patterns).ok().map(|set| (set, meta))
639            },
640        }
641    }
642}
643
644// ── tests ─────────────────────────────────────────────────────────────────────
645
646#[cfg(test)]
647mod tests {
648    use super::*;
649    use crate::{key, Mode};
650
651    // ── fixtures ──────────────────────────────────────────────────────────────
652
653    /// Action-oriented dictionary (案2 keys)
654    fn action_dict() -> Dictionary {
655        Dictionary::builder()
656            .add("shutdown",        key::BLOCK)
657            .add("crash",           key::BLOCK)
658            .add("disk_full",       key::ALERT)
659            .add("oom",             key::ALERT)
660            .add("deprecated_api",  key::FLAG)
661            .add("slow_query",      key::THROTTLE)
662            .add("retry",           key::THROTTLE)
663            .add("user_login",      key::LOG)
664            .add("health_check",    key::PASS)
665            .add("ping",            key::PASS)
666            .build()
667    }
668
669    /// C-version compatible dictionary (keys 0-14)
670    fn compat_dict() -> Dictionary {
671        Dictionary::from_text(
672            "apple\t9\nspam\t2\nadult_word\t1\nfree\t2\nprize\t2\n"
673        )
674    }
675
676    // ── seek ─────────────────────────────────────────────────────────────────
677
678    #[test]
679    fn seek_block() {
680        let d = action_dict();
681        assert_eq!(d.seek("shutdown"), Some(key::BLOCK));
682        assert_eq!(d.seek("crash"),    Some(key::BLOCK));
683    }
684
685    #[test]
686    fn seek_alert() {
687        let d = action_dict();
688        assert_eq!(d.seek("disk_full"), Some(key::ALERT));
689        assert_eq!(d.seek("oom"),       Some(key::ALERT));
690    }
691
692    #[test]
693    fn seek_flag() {
694        let d = action_dict();
695        assert_eq!(d.seek("deprecated_api"), Some(key::FLAG));
696    }
697
698    #[test]
699    fn seek_throttle() {
700        let d = action_dict();
701        assert_eq!(d.seek("slow_query"), Some(key::THROTTLE));
702        assert_eq!(d.seek("retry"),      Some(key::THROTTLE));
703    }
704
705    #[test]
706    fn seek_log() {
707        let d = action_dict();
708        assert_eq!(d.seek("user_login"), Some(key::LOG));
709    }
710
711    #[test]
712    fn seek_pass() {
713        let d = action_dict();
714        assert_eq!(d.seek("health_check"), Some(key::PASS));
715        assert_eq!(d.seek("ping"),         Some(key::PASS));
716    }
717
718    #[test]
719    fn seek_not_found() {
720        let d = action_dict();
721        assert_eq!(d.seek("unknown_event"), None);
722        assert_eq!(d.seek(""),              None);
723    }
724
725    #[test]
726    fn seek_case_sensitive() {
727        let d = action_dict();
728        // iword hash is case-sensitive
729        assert_eq!(d.seek("Shutdown"), None);
730        assert_eq!(d.seek("SHUTDOWN"), None);
731    }
732
733    // ── scan: forbid flag ─────────────────────────────────────────────────────
734
735    #[test]
736    fn scan_without_forbid_skips_actionable_keys() {
737        let d = action_dict();
738        // BLOCK(0), ALERT(1), FLAG(2), THROTTLE(3), LOG(4) are all < FORBID_THRESHOLD(5)
739        let text = "shutdown disk_full deprecated_api slow_query user_login health_check";
740        let m = d.scan(text, Mode::default());
741        // Only PASS(5) and above should appear without FORBID
742        assert!(m.iter().all(|x| x.key >= key::FORBID_THRESHOLD));
743        assert!(m.iter().any(|x| x.key == key::PASS));
744    }
745
746    #[test]
747    fn scan_with_forbid_returns_all() {
748        let d = action_dict();
749        let text = "shutdown disk_full deprecated_api slow_query user_login health_check";
750        let m = d.scan(text, Mode::FORBID);
751        let keys: Vec<u8> = m.iter().map(|x| x.key).collect();
752        assert!(keys.contains(&key::BLOCK));
753        assert!(keys.contains(&key::ALERT));
754        assert!(keys.contains(&key::FLAG));
755        assert!(keys.contains(&key::THROTTLE));
756        assert!(keys.contains(&key::LOG));
757        assert!(keys.contains(&key::PASS));
758    }
759
760    #[test]
761    fn scan_empty_text() {
762        let d = action_dict();
763        assert!(d.scan("", Mode::FORBID).is_empty());
764    }
765
766    #[test]
767    fn scan_no_match() {
768        let d = action_dict();
769        assert!(d.scan("everything is fine today", Mode::FORBID).is_empty());
770    }
771
772    // ── scan: position and length ─────────────────────────────────────────────
773
774    #[test]
775    fn scan_position_and_length() {
776        let d = action_dict();
777        let text = "system shutdown detected";
778        let m = d.scan(text, Mode::FORBID);
779        let hit = m.iter().find(|x| x.key == key::BLOCK).expect("shutdown not found");
780        assert_eq!(hit.extract(text), "shutdown");
781        assert_eq!(hit.position, 7);
782        assert_eq!(hit.length, 8);
783    }
784
785    #[test]
786    fn scan_multiple_matches_ordered() {
787        let d = action_dict();
788        let text = "crash then disk_full";
789        let m = d.scan(text, Mode::FORBID);
790        assert!(m.len() >= 2);
791        // positions should be ascending
792        let positions: Vec<usize> = m.iter().map(|x| x.position).collect();
793        assert!(positions.windows(2).all(|w| w[0] < w[1]));
794    }
795
796    #[test]
797    fn scan_match_at_start() {
798        let d = action_dict();
799        let text = "shutdown now";
800        let m = d.scan(text, Mode::FORBID);
801        assert!(!m.is_empty());
802        assert_eq!(m[0].position, 0);
803    }
804
805    #[test]
806    fn scan_match_at_end() {
807        let d = action_dict();
808        let text = "system crash";
809        let m = d.scan(text, Mode::FORBID);
810        let hit = m.iter().find(|x| x.key == key::BLOCK).expect("crash not found");
811        assert_eq!(hit.extract(text), "crash");
812        assert_eq!(hit.position + hit.length, text.len());
813    }
814
815    // ── scan: HTML mode ───────────────────────────────────────────────────────
816
817    #[test]
818    fn scan_html_skips_tags() {
819        let d = action_dict();
820        // "shutdown" inside a tag attribute should be skipped
821        let text = r#"<meta name="shutdown"> disk_full occurred"#;
822        let m = d.scan(text, Mode::HTML | Mode::FORBID);
823        assert!(m.iter().all(|x| x.key != key::BLOCK), "shutdown inside tag should be skipped");
824        assert!(m.iter().any(|x| x.key == key::ALERT));
825    }
826
827    #[test]
828    fn scan_html_finds_text_content() {
829        let d = action_dict();
830        let text = "<p>system crash detected</p>";
831        let m = d.scan(text, Mode::HTML | Mode::FORBID);
832        assert!(m.iter().any(|x| x.key == key::BLOCK));
833    }
834
835    // ── filter ────────────────────────────────────────────────────────────────
836
837    #[test]
838    fn filter_replaces_with_stars() {
839        let d = action_dict();
840        let out = d.filter("system shutdown detected", Mode::FORBID);
841        assert!(!out.contains("shutdown"));
842        assert!(out.contains('*'));
843        assert_eq!(out.len(), "system shutdown detected".len());
844    }
845
846    #[test]
847    fn filter_clean_text_unchanged() {
848        let d = action_dict();
849        let text = "everything is running smoothly";
850        assert_eq!(d.filter(text, Mode::FORBID), text);
851    }
852
853    #[test]
854    fn filter_multiple_words() {
855        let d = action_dict();
856        let out = d.filter("crash and disk_full", Mode::FORBID);
857        assert!(!out.contains("crash"));
858        assert!(!out.contains("disk_full"));
859        assert_eq!(out.chars().filter(|&c| c == '*').count(),
860                   "crash".len() + "disk_full".len());
861    }
862
863    // ── scan_key ─────────────────────────────────────────────────────────────
864
865    #[test]
866    fn scan_key_returns_only_requested_key() {
867        let d = action_dict();
868        let text = "crash disk_full deprecated_api slow_query health_check";
869        let blocks = d.scan_key(text, key::BLOCK, Mode::FORBID);
870        assert!(blocks.iter().all(|x| x.key == key::BLOCK));
871        assert!(!blocks.is_empty());
872    }
873
874    #[test]
875    fn scan_key_empty_when_no_match() {
876        let d = action_dict();
877        assert!(d.scan_key("health_check ping", key::BLOCK, Mode::FORBID).is_empty());
878    }
879
880    // ── builder ───────────────────────────────────────────────────────────────
881
882    #[test]
883    fn builder_add_many() {
884        let d = Dictionary::builder()
885            .add_many(&["crash", "panic"], key::BLOCK)
886            .add_many(&["warn", "slow"],   key::THROTTLE)
887            .add("ok", key::PASS)
888            .build();
889        assert_eq!(d.seek("crash"), Some(key::BLOCK));
890        assert_eq!(d.seek("panic"), Some(key::BLOCK));
891        assert_eq!(d.seek("warn"),  Some(key::THROTTLE));
892        assert_eq!(d.seek("ok"),    Some(key::PASS));
893    }
894
895    #[test]
896    fn builder_merge() {
897        let security = Dictionary::builder()
898            .add("shutdown", key::BLOCK)
899            .add("breach",   key::ALERT);
900        let perf = Dictionary::builder()
901            .add("slow_query", key::THROTTLE)
902            .add("timeout",    key::FLAG);
903        let d = security.merge(perf).build();
904        assert_eq!(d.seek("shutdown"),   Some(key::BLOCK));
905        assert_eq!(d.seek("breach"),     Some(key::ALERT));
906        assert_eq!(d.seek("slow_query"), Some(key::THROTTLE));
907        assert_eq!(d.seek("timeout"),    Some(key::FLAG));
908    }
909
910    #[test]
911    fn builder_empty() {
912        let d = Dictionary::builder().build();
913        assert_eq!(d.seek("anything"), None);
914        assert!(d.scan("anything", Mode::FORBID).is_empty());
915    }
916
917    // ── key range ─────────────────────────────────────────────────────────────
918
919    #[test]
920    fn key_full_u8_range() {
921        let d = Dictionary::builder()
922            .add("low",  0u8)
923            .add("mid",  100u8)
924            .add("high", 254u8)
925            .build();
926        assert_eq!(d.seek("low"),  Some(0));
927        assert_eq!(d.seek("mid"),  Some(100));
928        assert_eq!(d.seek("high"), Some(254));
929    }
930
931    #[test]
932    fn load_str_decimal_key() {
933        let d = Dictionary::from_text("critical_event\t20\nbulk_import\t100\n");
934        assert_eq!(d.seek("critical_event"), Some(20));
935        assert_eq!(d.seek("bulk_import"),    Some(100));
936    }
937
938    #[test]
939    fn load_str_hex_key_compat() {
940        // single-char hex for C-version compatibility
941        let d = Dictionary::from_text("spam_word\t2\nadult_word\t1\napple\t9\n");
942        assert_eq!(d.seek("spam_word"),  Some(2));
943        assert_eq!(d.seek("adult_word"), Some(1));
944        assert_eq!(d.seek("apple"),      Some(9));
945    }
946
947    #[test]
948    fn load_str_default_key() {
949        // no tab = default key (LOG=4? No — DEFAULT=9 for compat)
950        let d = Dictionary::from_text("someword\n");
951        assert_eq!(d.seek("someword"), Some(key::DEFAULT));
952    }
953
954    // ── mask ─────────────────────────────────────────────────────────────────
955
956    #[test]
957    fn mask_reflects_loaded_keys() {
958        let d = action_dict();
959        let m = d.mask();
960        assert!(m & (1u128 << key::BLOCK)    != 0);
961        assert!(m & (1u128 << key::ALERT)    != 0);
962        assert!(m & (1u128 << key::FLAG)     != 0);
963        assert!(m & (1u128 << key::THROTTLE) != 0);
964        assert!(m & (1u128 << key::LOG)      != 0);
965        assert!(m & (1u128 << key::PASS)     != 0);
966    }
967
968    #[test]
969    fn mask_empty_dict() {
970        let d = Dictionary::builder().build();
971        assert_eq!(d.mask(), 0);
972    }
973
974    // ── scan_first / contains / severity ─────────────────────────────────────
975
976    #[test]
977    fn scan_first_returns_first_match() {
978        let d = action_dict();
979        let text = "crash then disk_full";
980        let m = d.scan_first(text, Mode::FORBID).expect("should match");
981        assert_eq!(m.extract(text), "crash");
982    }
983
984    #[test]
985    fn scan_first_none_on_no_match() {
986        let d = action_dict();
987        assert!(d.scan_first("everything is fine", Mode::FORBID).is_none());
988    }
989
990    #[test]
991    fn scan_first_respects_forbid() {
992        let d = action_dict();
993        // shutdown is BLOCK(0), requires FORBID
994        assert!(d.scan_first("shutdown", Mode::default()).is_none());
995        assert!(d.scan_first("shutdown", Mode::FORBID).is_some());
996    }
997
998    #[test]
999    fn contains_true_on_match() {
1000        let d = action_dict();
1001        assert!(d.contains("system crash detected", Mode::FORBID));
1002    }
1003
1004    #[test]
1005    fn contains_false_on_no_match() {
1006        let d = action_dict();
1007        assert!(!d.contains("all systems nominal", Mode::FORBID));
1008    }
1009
1010    #[test]
1011    fn severity_returns_lowest_key() {
1012        let d = action_dict();
1013        // crash=BLOCK(0), disk_full=ALERT(1) — severity should return BLOCK
1014        let text = "disk_full and crash occurred";
1015        let m = d.severity(text, Mode::FORBID).expect("should match");
1016        assert_eq!(m.key, key::BLOCK);
1017        assert_eq!(m.extract(text), "crash");
1018    }
1019
1020    #[test]
1021    fn severity_none_on_no_match() {
1022        let d = action_dict();
1023        assert!(d.severity("all clear", Mode::FORBID).is_none());
1024    }
1025
1026    // ── C-version compat ──────────────────────────────────────────────────────
1027
1028    #[test]
1029    fn compat_seek() {
1030        let d = compat_dict();
1031        assert_eq!(d.seek("apple"), Some(9));
1032        assert_eq!(d.seek("spam"),  Some(2));
1033        assert_eq!(d.seek("adult_word"), Some(1));
1034        assert_eq!(d.seek("notaword"),   None);
1035    }
1036
1037    // ── classify / score / weight ─────────────────────────────────────────────
1038
1039    #[test]
1040    fn classify_returns_dominant_key() {
1041        let d = action_dict();
1042        // 2× BLOCK words vs 1× ALERT word → BLOCK wins
1043        let text = "crash and shutdown cause disk_full";
1044        let r = d.classify(text, Mode::FORBID).expect("should classify");
1045        assert_eq!(r.key, key::BLOCK);
1046        assert!((r.score - 2.0).abs() < 0.01);
1047    }
1048
1049    #[test]
1050    fn classify_none_on_no_match() {
1051        let d = action_dict();
1052        assert!(d.classify("all clear nothing here", Mode::FORBID).is_none());
1053    }
1054
1055    #[test]
1056    fn score_returns_per_key_scores() {
1057        let d = action_dict();
1058        let text = "crash disk_full slow_query";
1059        let scores = d.score(text, Mode::FORBID);
1060        assert!((scores[&key::BLOCK]    - 1.0).abs() < 0.01);
1061        assert!((scores[&key::ALERT]    - 1.0).abs() < 0.01);
1062        assert!((scores[&key::THROTTLE] - 1.0).abs() < 0.01);
1063    }
1064
1065    #[test]
1066    fn word_weight_affects_score() {
1067        let d = Dictionary::builder()
1068            .add_weighted("critical_crash", key::BLOCK, 5.0)
1069            .add_weighted("minor_issue",    key::BLOCK, 1.0)
1070            .build();
1071        let scores = d.score("critical_crash and minor_issue", Mode::FORBID);
1072        assert!((scores[&key::BLOCK] - 6.0).abs() < 0.01);
1073    }
1074
1075    #[test]
1076    fn key_weight_affects_score() {
1077        let d = Dictionary::builder()
1078            .add("crash",     key::BLOCK)
1079            .add("slow_query", key::THROTTLE)
1080            .set_key_weight(key::BLOCK, 10.0)
1081            .build();
1082        let scores = d.score("crash slow_query", Mode::FORBID);
1083        assert!((scores[&key::BLOCK]    - 10.0).abs() < 0.01);
1084        assert!((scores[&key::THROTTLE] -  1.0).abs() < 0.01);
1085    }
1086
1087    #[test]
1088    fn load_str_with_weight() {
1089        let d = Dictionary::from_text("shutdown\t0\t5.0\ndisk_full\t1\t2.0\nping\t5\n");
1090        let scores = d.score("shutdown disk_full ping", Mode::FORBID);
1091        assert!((scores[&key::BLOCK] - 5.0).abs() < 0.01);
1092        assert!((scores[&key::ALERT] - 2.0).abs() < 0.01);
1093        assert!((scores[&key::PASS]  - 1.0).abs() < 0.01);
1094    }
1095
1096    #[test]
1097    fn classify_with_weights_changes_winner() {
1098        let d = Dictionary::builder()
1099            .add("crash",      key::BLOCK)
1100            .add("crash",      key::BLOCK)
1101            .add("slow_query", key::THROTTLE)
1102            .build();
1103        // Without runtime weights: BLOCK(2.0) wins over THROTTLE(1.0)
1104        let text = "crash crash slow_query";
1105        let r = d.classify(text, Mode::FORBID).unwrap();
1106        assert_eq!(r.key, key::BLOCK);
1107
1108        // With runtime weights: boost THROTTLE 5× → THROTTLE(5.0) beats BLOCK(2.0)
1109        let r2 = d.classify_with_weights(text, Mode::FORBID, &[(key::THROTTLE, 5.0)]).unwrap();
1110        assert_eq!(r2.key, key::THROTTLE);
1111        assert!((r2.score - 5.0).abs() < 0.01);
1112    }
1113
1114    #[test]
1115    fn score_with_weights_applies_runtime_multiplier() {
1116        let d = action_dict();
1117        let text = "crash disk_full";
1118        let scores = d.score_with_weights(text, Mode::FORBID, &[(key::ALERT, 3.0)]);
1119        assert!((scores[&key::BLOCK] - 1.0).abs() < 0.01);  // no multiplier
1120        assert!((scores[&key::ALERT] - 3.0).abs() < 0.01);  // ×3
1121    }
1122
1123    #[test]
1124    fn compat_scan_forbid() {
1125        let d = compat_dict();
1126        let m = d.scan("get free prize now", Mode::HTML | Mode::FORBID);
1127        assert!(!m.is_empty());
1128        assert!(m.iter().any(|x| x.key == 2));
1129    }
1130}