oxihuman_core/
aho_corasick.rs1#![allow(dead_code)]
4
5use std::collections::{HashMap, VecDeque};
8
9#[derive(Debug, Default, Clone)]
11pub struct AcNode {
12 pub children: HashMap<u8, usize>,
13 pub fail: usize,
14 pub output: Vec<usize>,
16}
17
18pub struct AhoCorasick {
20 pub nodes: Vec<AcNode>,
21 pub patterns: Vec<Vec<u8>>,
22}
23
24impl AhoCorasick {
25 fn new() -> Self {
26 AhoCorasick {
27 nodes: vec![AcNode::default()],
28 patterns: Vec::new(),
29 }
30 }
31}
32
33pub fn new_aho_corasick() -> AhoCorasick {
35 AhoCorasick::new()
36}
37
38pub fn ac_add_pattern(ac: &mut AhoCorasick, pattern: &[u8]) {
40 let id = ac.patterns.len();
41 ac.patterns.push(pattern.to_vec());
42 let mut cur = 0;
43 for &b in pattern {
44 if !ac.nodes[cur].children.contains_key(&b) {
45 let next = ac.nodes.len();
46 ac.nodes.push(AcNode::default());
47 ac.nodes[cur].children.insert(b, next);
48 }
49 cur = ac.nodes[cur].children[&b];
50 }
51 ac.nodes[cur].output.push(id);
52}
53
54pub fn ac_build(ac: &mut AhoCorasick) {
56 let mut queue = VecDeque::new();
57 let root_children: Vec<(u8, usize)> =
59 ac.nodes[0].children.iter().map(|(&b, &v)| (b, v)).collect();
60 for (_, child) in &root_children {
61 ac.nodes[*child].fail = 0;
62 queue.push_back(*child);
63 }
64 while let Some(u) = queue.pop_front() {
65 let children: Vec<(u8, usize)> =
66 ac.nodes[u].children.iter().map(|(&b, &v)| (b, v)).collect();
67 for (b, v) in children {
68 let mut f = ac.nodes[u].fail;
69 loop {
70 if ac.nodes[f].children.contains_key(&b) {
71 f = ac.nodes[f].children[&b];
72 break;
73 }
74 if f == 0 {
75 break;
76 }
77 f = ac.nodes[f].fail;
78 }
79 if f == v {
81 f = 0;
82 }
83 ac.nodes[v].fail = f;
84 let fail_out = ac.nodes[f].output.clone();
85 ac.nodes[v].output.extend(fail_out);
86 queue.push_back(v);
87 }
88 }
89}
90
91#[derive(Debug, Clone, Copy, PartialEq, Eq)]
93pub struct AcMatch {
94 pub start: usize,
95 pub pattern_id: usize,
96}
97
98pub fn ac_search(ac: &AhoCorasick, text: &[u8]) -> Vec<AcMatch> {
100 let mut cur = 0;
101 let mut results = Vec::new();
102
103 for (i, &b) in text.iter().enumerate() {
104 loop {
106 if ac.nodes[cur].children.contains_key(&b) {
107 cur = ac.nodes[cur].children[&b];
108 break;
109 }
110 if cur == 0 {
111 break;
112 }
113 cur = ac.nodes[cur].fail;
114 }
115 for &pid in &ac.nodes[cur].output {
116 let pat_len = ac.patterns[pid].len();
117 let start = i + 1 - pat_len;
118 results.push(AcMatch {
119 start,
120 pattern_id: pid,
121 });
122 }
123 }
124 results
125}
126
127pub fn ac_pattern_count(ac: &AhoCorasick) -> usize {
129 ac.patterns.len()
130}
131
132pub fn ac_contains(ac: &AhoCorasick, text: &[u8]) -> bool {
134 !ac_search(ac, text).is_empty()
135}
136
137#[cfg(test)]
138mod tests {
139 use super::*;
140
141 fn build(patterns: &[&str]) -> AhoCorasick {
142 let mut ac = new_aho_corasick();
143 for p in patterns {
144 ac_add_pattern(&mut ac, p.as_bytes());
145 }
146 ac_build(&mut ac);
147 ac
148 }
149
150 #[test]
151 fn test_single_pattern_found() {
152 let ac = build(&["he"]);
153 let matches = ac_search(&ac, b"ahebcd");
154 assert!(!matches.is_empty());
155 assert_eq!(matches[0].start, 1);
156 }
157
158 #[test]
159 fn test_multiple_patterns() {
160 let ac = build(&["he", "she", "his", "hers"]);
161 let matches = ac_search(&ac, b"ushers");
162 assert!(!matches.is_empty());
163 }
164
165 #[test]
166 fn test_no_match() {
167 let ac = build(&["xyz"]);
168 let matches = ac_search(&ac, b"hello world");
169 assert!(matches.is_empty());
170 }
171
172 #[test]
173 fn test_pattern_count() {
174 let ac = build(&["a", "b", "c"]);
175 assert_eq!(ac_pattern_count(&ac), 3);
176 }
177
178 #[test]
179 fn test_contains_true() {
180 let ac = build(&["world"]);
181 assert!(ac_contains(&ac, b"hello world"));
182 }
183
184 #[test]
185 fn test_contains_false() {
186 let ac = build(&["moon"]);
187 assert!(!ac_contains(&ac, b"hello world"));
188 }
189
190 #[test]
191 fn test_overlapping_patterns() {
192 let ac = build(&["a", "aa"]);
194 let m = ac_search(&ac, b"aaa");
195 assert!(m.len() >= 3); }
197
198 #[test]
199 fn test_empty_text() {
200 let ac = build(&["abc"]);
201 assert!(ac_search(&ac, b"").is_empty());
202 }
203
204 #[test]
205 fn test_pattern_at_end() {
206 let ac = build(&["end"]);
207 let m = ac_search(&ac, b"the end");
208 assert!(!m.is_empty());
209 assert_eq!(m[0].start, 4);
210 }
211}