kalosm_sample/structured_parser/
regex.rs

1use std::{collections::HashMap, sync::RwLock};
2
3use crate::{CreateParserState, Parser};
4use regex_automata::{
5    dfa::{dense, Automaton},
6    util::primitives::StateID,
7};
8
9/// A parser that uses a regex pattern to parse input.
10pub struct RegexParser {
11    dfa: dense::DFA<Vec<u32>>,
12    config: regex_automata::util::start::Config,
13    // A cache for the required next bytes for each state
14    jump_table: RwLock<HashMap<StateID, String>>,
15}
16
17impl RegexParser {
18    /// Create a new `RegexParser` from a regex pattern.
19    #[allow(clippy::result_large_err)]
20    pub fn new(regex: &str) -> std::result::Result<Self, regex_automata::dfa::dense::BuildError> {
21        let dfa = dense::DFA::new(regex)?;
22
23        let config =
24            regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes);
25
26        Ok(Self {
27            dfa,
28            config,
29            jump_table: Default::default(),
30        })
31    }
32}
33
34impl CreateParserState for RegexParser {
35    fn create_parser_state(&self) -> <Self as Parser>::PartialState {
36        let start_state = self.dfa.start_state(&self.config).unwrap();
37        RegexParserState {
38            state: start_state,
39            value: Vec::new(),
40        }
41    }
42}
43
44impl Parser for RegexParser {
45    type Output = String;
46    type PartialState = RegexParserState;
47
48    fn parse<'a>(
49        &self,
50        state: &Self::PartialState,
51        input: &'a [u8],
52    ) -> crate::ParseResult<crate::ParseStatus<'a, Self::PartialState, Self::Output>> {
53        let mut state = state.clone();
54        let mut iter = input.iter();
55        while let Some(&b) = iter.next() {
56            state.state = self.dfa.next_state(state.state, b);
57            state.value.push(b);
58            // If this is a match state, accept it only if it matches the whole regex
59            let finish_state = self.dfa.next_eoi_state(state.state);
60            if self.dfa.is_match_state(finish_state) {
61                return Ok(crate::ParseStatus::Finished {
62                    result: String::from_utf8_lossy(&state.value).to_string(),
63                    remaining: iter.as_slice(),
64                });
65            } else if self.dfa.is_dead_state(state.state) || self.dfa.is_quit_state(state.state) {
66                crate::bail!(regex_automata::MatchError::quit(b, 0));
67            }
68        }
69
70        let mut required_next = String::new();
71        let mut required_next_state = state.state;
72        let jump_table_read = self.jump_table.read().unwrap();
73
74        if let Some(string) = jump_table_read.get(&required_next_state) {
75            required_next.push_str(string);
76        } else {
77            'required_next: loop {
78                let mut one_valid_byte = None;
79
80                if let Some(string) = jump_table_read.get(&required_next_state) {
81                    required_next.push_str(string);
82                    break;
83                }
84
85                for byte in 0..=255 {
86                    let next_state = self.dfa.next_state(required_next_state, byte);
87                    if self.dfa.is_dead_state(next_state) || self.dfa.is_quit_state(next_state) {
88                        continue;
89                    }
90                    if one_valid_byte.is_some() {
91                        break 'required_next;
92                    }
93                    one_valid_byte = Some((byte, next_state));
94                }
95
96                if let Some((byte, new_state)) = one_valid_byte {
97                    required_next.push(byte.into());
98                    required_next_state = new_state;
99                } else {
100                    break;
101                }
102            }
103
104            if !required_next.is_empty() {
105                drop(jump_table_read);
106                self.jump_table
107                    .write()
108                    .unwrap()
109                    .insert(state.state, required_next.clone());
110            }
111        }
112
113        Ok(crate::ParseStatus::Incomplete {
114            new_state: state,
115            required_next: required_next.into(),
116        })
117    }
118}
119
120/// The state of a regex parser.
121#[derive(Default, Debug, PartialEq, Eq, Clone)]
122pub struct RegexParserState {
123    state: StateID,
124    value: Vec<u8>,
125}
126
127#[test]
128fn parse_regex() {
129    use crate::ParseStatus;
130
131    let regex = r#"\"\w+\""#;
132    let parser = RegexParser::new(regex).unwrap();
133    let state = parser.create_parser_state();
134    let result = parser.parse(&state, b"\"hello\"world").unwrap();
135    assert_eq!(
136        result,
137        ParseStatus::Finished {
138            result: "\"hello\"".to_string(),
139            remaining: b"world"
140        }
141    );
142
143    let result = parser.parse(&state, b"\"hello world\"");
144    assert!(result.is_err(),);
145
146    let result = parser.parse(&state, b"\"hel").unwrap();
147    match result {
148        ParseStatus::Incomplete {
149            new_state,
150            required_next,
151        } => {
152            assert_eq!(new_state.value, b"\"hel");
153            assert!(required_next.is_empty());
154        }
155        _ => panic!("unexpected result to be incomplete: {result:?}"),
156    }
157}
158
159#[test]
160fn required_next_regex() {
161    use crate::ParseStatus;
162
163    let regex = r#"\{ name: "\w+", description: "[\w ]+" \}"#;
164    let parser = RegexParser::new(regex).unwrap();
165    let start_state = parser
166        .dfa
167        .start_state(
168            &regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes),
169        )
170        .unwrap();
171    let state = parser.create_parser_state();
172    let result = parser.parse(&state, b"").unwrap();
173    assert_eq!(
174        result,
175        ParseStatus::Incomplete {
176            new_state: RegexParserState {
177                state: start_state,
178                value: b"".to_vec(),
179            },
180            required_next: "{ name: \"".into(),
181        }
182    );
183
184    let result = parser.parse(&state, b"{ name: \"hello\"").unwrap();
185
186    match result {
187        ParseStatus::Incomplete {
188            new_state,
189            required_next,
190        } => {
191            assert_eq!(new_state.value, b"{ name: \"hello\"");
192            assert_eq!(required_next, ", description: \"");
193        }
194        _ => panic!("unexpected result to be incomplete: {result:?}"),
195    }
196}