kalosm_sample/structured_parser/
regex.rs1use std::{collections::HashMap, sync::RwLock};
2
3use crate::{CreateParserState, Parser};
4use regex_automata::{
5 dfa::{dense, Automaton},
6 util::primitives::StateID,
7};
8
9pub struct RegexParser {
11 dfa: dense::DFA<Vec<u32>>,
12 config: regex_automata::util::start::Config,
13 jump_table: RwLock<HashMap<StateID, String>>,
15}
16
17impl RegexParser {
18 #[allow(clippy::result_large_err)]
20 pub fn new(regex: &str) -> std::result::Result<Self, regex_automata::dfa::dense::BuildError> {
21 let dfa = dense::DFA::new(regex)?;
22
23 let config =
24 regex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes);
25
26 Ok(Self {
27 dfa,
28 config,
29 jump_table: Default::default(),
30 })
31 }
32}
33
34impl CreateParserState for RegexParser {
35 fn create_parser_state(&self) -> <Self as Parser>::PartialState {
36 let start_state = self.dfa.start_state(&self.config).unwrap();
37 RegexParserState {
38 state: start_state,
39 value: Vec::new(),
40 }
41 }
42}
43
44impl Parser for RegexParser {
45 type Output = String;
46 type PartialState = RegexParserState;
47
48 fn parse<'a>(
49 &self,
50 state: &Self::PartialState,
51 input: &'a [u8],
52 ) -> crate::ParseResult<crate::ParseStatus<'a, Self::PartialState, Self::Output>> {
53 let mut state = state.clone();
54 let mut iter = input.iter();
55 while let Some(&b) = iter.next() {
56 state.state = self.dfa.next_state(state.state, b);
57 state.value.push(b);
58 let finish_state = self.dfa.next_eoi_state(state.state);
60 if self.dfa.is_match_state(finish_state) {
61 return Ok(crate::ParseStatus::Finished {
62 result: String::from_utf8_lossy(&state.value).to_string(),
63 remaining: iter.as_slice(),
64 });
65 } else if self.dfa.is_dead_state(state.state) || self.dfa.is_quit_state(state.state) {
66 crate::bail!(regex_automata::MatchError::quit(b, 0));
67 }
68 }
69
70 let mut required_next = String::new();
71 let mut required_next_state = state.state;
72 let jump_table_read = self.jump_table.read().unwrap();
73
74 if let Some(string) = jump_table_read.get(&required_next_state) {
75 required_next.push_str(string);
76 } else {
77 'required_next: loop {
78 let mut one_valid_byte = None;
79
80 if let Some(string) = jump_table_read.get(&required_next_state) {
81 required_next.push_str(string);
82 break;
83 }
84
85 for byte in 0..=255 {
86 let next_state = self.dfa.next_state(required_next_state, byte);
87 if self.dfa.is_dead_state(next_state) || self.dfa.is_quit_state(next_state) {
88 continue;
89 }
90 if one_valid_byte.is_some() {
91 break 'required_next;
92 }
93 one_valid_byte = Some((byte, next_state));
94 }
95
96 if let Some((byte, new_state)) = one_valid_byte {
97 required_next.push(byte.into());
98 required_next_state = new_state;
99 } else {
100 break;
101 }
102 }
103
104 if !required_next.is_empty() {
105 drop(jump_table_read);
106 self.jump_table
107 .write()
108 .unwrap()
109 .insert(state.state, required_next.clone());
110 }
111 }
112
113 Ok(crate::ParseStatus::Incomplete {
114 new_state: state,
115 required_next: required_next.into(),
116 })
117 }
118}
119
120#[derive(Default, Debug, PartialEq, Eq, Clone)]
122pub struct RegexParserState {
123 state: StateID,
124 value: Vec<u8>,
125}
126
127#[test]
128fn parse_regex() {
129 use crate::ParseStatus;
130
131 let regex = r#"\"\w+\""#;
132 let parser = RegexParser::new(regex).unwrap();
133 let state = parser.create_parser_state();
134 let result = parser.parse(&state, b"\"hello\"world").unwrap();
135 assert_eq!(
136 result,
137 ParseStatus::Finished {
138 result: "\"hello\"".to_string(),
139 remaining: b"world"
140 }
141 );
142
143 let result = parser.parse(&state, b"\"hello world\"");
144 assert!(result.is_err(),);
145
146 let result = parser.parse(&state, b"\"hel").unwrap();
147 match result {
148 ParseStatus::Incomplete {
149 new_state,
150 required_next,
151 } => {
152 assert_eq!(new_state.value, b"\"hel");
153 assert!(required_next.is_empty());
154 }
155 _ => panic!("unexpected result to be incomplete: {result:?}"),
156 }
157}
158
159#[test]
160fn required_next_regex() {
161 use crate::ParseStatus;
162
163 let regex = r#"\{ name: "\w+", description: "[\w ]+" \}"#;
164 let parser = RegexParser::new(regex).unwrap();
165 let start_state = parser
166 .dfa
167 .start_state(
168 ®ex_automata::util::start::Config::new().anchored(regex_automata::Anchored::Yes),
169 )
170 .unwrap();
171 let state = parser.create_parser_state();
172 let result = parser.parse(&state, b"").unwrap();
173 assert_eq!(
174 result,
175 ParseStatus::Incomplete {
176 new_state: RegexParserState {
177 state: start_state,
178 value: b"".to_vec(),
179 },
180 required_next: "{ name: \"".into(),
181 }
182 );
183
184 let result = parser.parse(&state, b"{ name: \"hello\"").unwrap();
185
186 match result {
187 ParseStatus::Incomplete {
188 new_state,
189 required_next,
190 } => {
191 assert_eq!(new_state.value, b"{ name: \"hello\"");
192 assert_eq!(required_next, ", description: \"");
193 }
194 _ => panic!("unexpected result to be incomplete: {result:?}"),
195 }
196}