Skip to main content

oxihuman_core/
aho_corasick.rs

1// Copyright (C) 2026 COOLJAPAN OU (Team KitaSan)
2// SPDX-License-Identifier: Apache-2.0
3#![allow(dead_code)]
4
5//! Aho-Corasick multi-pattern string search stub.
6
7use std::collections::{HashMap, VecDeque};
8
9/// A node in the Aho-Corasick automaton.
10#[derive(Debug, Default, Clone)]
11pub struct AcNode {
12    pub children: HashMap<u8, usize>,
13    pub fail: usize,
14    /// Output: indices into the pattern list.
15    pub output: Vec<usize>,
16}
17
18/// The Aho-Corasick automaton.
19pub struct AhoCorasick {
20    pub nodes: Vec<AcNode>,
21    pub patterns: Vec<Vec<u8>>,
22}
23
24impl AhoCorasick {
25    fn new() -> Self {
26        AhoCorasick {
27            nodes: vec![AcNode::default()],
28            patterns: Vec::new(),
29        }
30    }
31}
32
33/// Create a new (empty) Aho-Corasick automaton.
34pub fn new_aho_corasick() -> AhoCorasick {
35    AhoCorasick::new()
36}
37
38/// Add a pattern to the automaton (must call `ac_build` before searching).
39pub fn ac_add_pattern(ac: &mut AhoCorasick, pattern: &[u8]) {
40    let id = ac.patterns.len();
41    ac.patterns.push(pattern.to_vec());
42    let mut cur = 0;
43    for &b in pattern {
44        if !ac.nodes[cur].children.contains_key(&b) {
45            let next = ac.nodes.len();
46            ac.nodes.push(AcNode::default());
47            ac.nodes[cur].children.insert(b, next);
48        }
49        cur = ac.nodes[cur].children[&b];
50    }
51    ac.nodes[cur].output.push(id);
52}
53
54/// Build failure links (BFS over the trie).
55pub fn ac_build(ac: &mut AhoCorasick) {
56    let mut queue = VecDeque::new();
57    /* root's children have fail = 0 */
58    let root_children: Vec<(u8, usize)> =
59        ac.nodes[0].children.iter().map(|(&b, &v)| (b, v)).collect();
60    for (_, child) in &root_children {
61        ac.nodes[*child].fail = 0;
62        queue.push_back(*child);
63    }
64    while let Some(u) = queue.pop_front() {
65        let children: Vec<(u8, usize)> =
66            ac.nodes[u].children.iter().map(|(&b, &v)| (b, v)).collect();
67        for (b, v) in children {
68            let mut f = ac.nodes[u].fail;
69            loop {
70                if ac.nodes[f].children.contains_key(&b) {
71                    f = ac.nodes[f].children[&b];
72                    break;
73                }
74                if f == 0 {
75                    break;
76                }
77                f = ac.nodes[f].fail;
78            }
79            /* avoid self-loop at root */
80            if f == v {
81                f = 0;
82            }
83            ac.nodes[v].fail = f;
84            let fail_out = ac.nodes[f].output.clone();
85            ac.nodes[v].output.extend(fail_out);
86            queue.push_back(v);
87        }
88    }
89}
90
91/// A search match: (start_pos, pattern_index).
92#[derive(Debug, Clone, Copy, PartialEq, Eq)]
93pub struct AcMatch {
94    pub start: usize,
95    pub pattern_id: usize,
96}
97
98/// Search `text` for all pattern occurrences. Returns matches in order.
99pub fn ac_search(ac: &AhoCorasick, text: &[u8]) -> Vec<AcMatch> {
100    let mut cur = 0;
101    let mut results = Vec::new();
102
103    for (i, &b) in text.iter().enumerate() {
104        /* follow fail links until a transition exists or we reach root */
105        loop {
106            if ac.nodes[cur].children.contains_key(&b) {
107                cur = ac.nodes[cur].children[&b];
108                break;
109            }
110            if cur == 0 {
111                break;
112            }
113            cur = ac.nodes[cur].fail;
114        }
115        for &pid in &ac.nodes[cur].output {
116            let pat_len = ac.patterns[pid].len();
117            let start = i + 1 - pat_len;
118            results.push(AcMatch {
119                start,
120                pattern_id: pid,
121            });
122        }
123    }
124    results
125}
126
127/// Return the number of patterns.
128pub fn ac_pattern_count(ac: &AhoCorasick) -> usize {
129    ac.patterns.len()
130}
131
132/// Return `true` if any pattern appears in `text`.
133pub fn ac_contains(ac: &AhoCorasick, text: &[u8]) -> bool {
134    !ac_search(ac, text).is_empty()
135}
136
137#[cfg(test)]
138mod tests {
139    use super::*;
140
141    fn build(patterns: &[&str]) -> AhoCorasick {
142        let mut ac = new_aho_corasick();
143        for p in patterns {
144            ac_add_pattern(&mut ac, p.as_bytes());
145        }
146        ac_build(&mut ac);
147        ac
148    }
149
150    #[test]
151    fn test_single_pattern_found() {
152        let ac = build(&["he"]);
153        let matches = ac_search(&ac, b"ahebcd");
154        assert!(!matches.is_empty());
155        assert_eq!(matches[0].start, 1);
156    }
157
158    #[test]
159    fn test_multiple_patterns() {
160        let ac = build(&["he", "she", "his", "hers"]);
161        let matches = ac_search(&ac, b"ushers");
162        assert!(!matches.is_empty());
163    }
164
165    #[test]
166    fn test_no_match() {
167        let ac = build(&["xyz"]);
168        let matches = ac_search(&ac, b"hello world");
169        assert!(matches.is_empty());
170    }
171
172    #[test]
173    fn test_pattern_count() {
174        let ac = build(&["a", "b", "c"]);
175        assert_eq!(ac_pattern_count(&ac), 3);
176    }
177
178    #[test]
179    fn test_contains_true() {
180        let ac = build(&["world"]);
181        assert!(ac_contains(&ac, b"hello world"));
182    }
183
184    #[test]
185    fn test_contains_false() {
186        let ac = build(&["moon"]);
187        assert!(!ac_contains(&ac, b"hello world"));
188    }
189
190    #[test]
191    fn test_overlapping_patterns() {
192        /* "a" appears inside "aa" */
193        let ac = build(&["a", "aa"]);
194        let m = ac_search(&ac, b"aaa");
195        assert!(m.len() >= 3); /* at least three "a" matches */
196    }
197
198    #[test]
199    fn test_empty_text() {
200        let ac = build(&["abc"]);
201        assert!(ac_search(&ac, b"").is_empty());
202    }
203
204    #[test]
205    fn test_pattern_at_end() {
206        let ac = build(&["end"]);
207        let m = ac_search(&ac, b"the end");
208        assert!(!m.is_empty());
209        assert_eq!(m[0].start, 4);
210    }
211}