Skip to main content

fst_regex/
dfa.rs

1use std::collections::{HashMap, HashSet};
2use std::fmt;
3
4use crate::sparse::SparseSet;
5use crate::{Error, Inst};
6
7const STATE_LIMIT: usize = 1_000; // currently at least 2MB >_<
8
9pub struct DfaBuilder {
10    dfa: Dfa,
11    cache: HashMap<Vec<usize>, usize>,
12}
13
14pub struct Dfa {
15    insts: Vec<Inst>,
16    states: Vec<State>,
17}
18
19struct State {
20    insts: Vec<usize>,
21    next: [Option<usize>; 256],
22    is_match: bool,
23}
24
25impl DfaBuilder {
26    pub fn new(insts: Vec<Inst>) -> Self {
27        DfaBuilder {
28            dfa: Dfa { insts: insts, states: Vec::with_capacity(16) },
29            cache: HashMap::with_capacity(1024),
30        }
31    }
32
33    pub fn build(mut self) -> Result<Dfa, Error> {
34        let mut cur = SparseSet::new(self.dfa.insts.len());
35        let mut next = SparseSet::new(self.dfa.insts.len());
36
37        self.dfa.add(&mut cur, 0);
38        let mut states = vec![self.cached_state(&cur).unwrap()];
39        let mut seen = HashSet::new();
40        while let Some(s) = states.pop() {
41            for b in 0..256 {
42                let ns = self.run_state(&mut cur, &mut next, s, b as u8);
43                if let Some(ns) = ns {
44                    if !seen.contains(&ns) {
45                        seen.insert(ns);
46                        states.push(ns);
47                    }
48                }
49                if self.dfa.states.len() > STATE_LIMIT {
50                    return Err(Error::TooManyStates(STATE_LIMIT).into());
51                }
52            }
53        }
54        Ok(self.dfa)
55    }
56
57    fn run_state(
58        &mut self,
59        cur: &mut SparseSet,
60        next: &mut SparseSet,
61        state: usize,
62        byte: u8,
63    ) -> Option<usize> {
64        cur.clear();
65        for &ip in &self.dfa.states[state].insts {
66            cur.add(ip);
67        }
68        self.dfa.run(cur, next, byte);
69        let next_state = self.cached_state(next);
70        self.dfa.states[state].next[byte as usize] = next_state;
71        next_state
72    }
73
74    fn cached_state(&mut self, set: &SparseSet) -> Option<usize> {
75        use std::collections::hash_map::Entry;
76        use crate::Inst::*;
77
78        // There are probably many ways to optimize this routine. ---AG
79
80        let mut insts = vec![];
81        let mut is_match = false;
82        for i in 0..set.len() {
83            let ip = set.get(i);
84            match self.dfa.insts[ip] {
85                Jump(_) | Split(_, _) => {}
86                Range(_, _) => insts.push(ip),
87                Match => {
88                    is_match = true;
89                    insts.push(ip);
90                }
91            }
92        }
93        if insts.len() == 0 {
94            return None;
95        }
96        Some(match self.cache.entry(insts.clone()) {
97            Entry::Occupied(v) => *v.get(),
98            Entry::Vacant(v) => {
99                self.dfa.states.push(State {
100                    insts: insts,
101                    next: [None; 256],
102                    is_match: is_match,
103                });
104                *v.insert(self.dfa.states.len() - 1)
105            }
106        })
107    }
108}
109
110impl Dfa {
111    pub fn is_match(&self, si: usize) -> bool {
112        self.states[si].is_match
113    }
114
115    pub fn accept(&self, si: usize, byte: u8) -> Option<usize> {
116        self.states[si].next[byte as usize]
117    }
118
119    fn add(&self, set: &mut SparseSet, ip: usize) {
120        use crate::Inst::*;
121
122        if set.contains(ip) {
123            return;
124        }
125        set.add(ip);
126        match self.insts[ip] {
127            Match | Range(_, _) => {}
128            Jump(ip) => self.add(set, ip),
129            Split(ip1, ip2) => {
130                self.add(set, ip1);
131                self.add(set, ip2);
132            }
133        }
134    }
135
136    fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: u8) -> bool {
137        use crate::Inst::*;
138        to.clear();
139        let mut is_match = false;
140        for i in 0..from.len() {
141            let ip = from.get(i);
142            match self.insts[ip] {
143                Jump(_) | Split(_, _) => {}
144                Match => is_match = true,
145                Range(s, e) => {
146                    if s <= byte && byte <= e {
147                        self.add(to, ip + 1);
148                    }
149                }
150            }
151        }
152        is_match
153    }
154}
155
156impl fmt::Debug for Dfa {
157    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
158        for (i, inst) in self.insts.iter().enumerate() {
159            writeln!(f, "{:03} {:?}", i, inst)?;
160        }
161        writeln!(f, "------------")?;
162        for (i, state) in self.states.iter().enumerate() {
163            if state.is_match {
164                writeln!(f, "{:03}* {:?}", i, state.insts)?;
165            } else {
166                writeln!(f, "{:03}  {:?}", i, state.insts)?;
167            }
168            for j in 0..256 {
169                if let Some(si) = state.next[j] {
170                    writeln!(f, "{:03}   {:X} => {}", i, j, si)?;
171                }
172            }
173        }
174        Ok(())
175    }
176}