1use std::collections::{HashMap, HashSet};
2use std::iter;
3use std::sync::atomic::{AtomicU64, Ordering};
4
5use crate::helpers::{TrailError, consume_lexeme, format_error, get_env, is_escaped, peek};
6use crate::regex::rex_parse;
7
8enum SplitMarker {
9 QUOTE { index: usize },
10 SLASH { index: usize },
11 GROUP { index: usize },
12}
13
14pub fn split_definition_into_lexemes(definition: &str) -> Result<Vec<String>, TrailError> {
15 let quantifiers: HashMap<char, (Vec<&str>, Vec<&str>)> = [
16 ('?', (vec!["["], vec!["]"])),
17 ('+', (vec!["{"], vec!["}"])),
18 ('*', (vec!["{", "["], vec!["]", "}"])),
19 ]
20 .into_iter()
21 .collect();
22 let delimiters: Vec<char> = vec!['(', ')', '[', ']', '{', '}', '|'];
23
24 let mut lexemes: Vec<String> = Vec::new();
25 let mut in_quote = false;
26 let mut in_slash = false;
27 let mut markers: Vec<SplitMarker> = Vec::new();
28 let mut accumulate: Vec<char> = Vec::new();
29 let mut i = 0;
30
31 let chars: Vec<char> = definition.chars().collect();
32
33 while i < chars.len() {
34 let curr = chars[i];
35
36 if curr == '/' && !in_quote {
38 if !in_slash {
39 consume_lexeme(&mut lexemes, &mut accumulate);
40 accumulate.push(curr);
41 in_slash = true;
42 markers.push(SplitMarker::SLASH { index: i });
43 } else if !is_escaped(&chars, i) {
44 accumulate.push(curr);
45
46 let input: String = accumulate[1..accumulate.len() - 1].iter().collect();
47 let regex = rex_parse(&input)?;
48
49 lexemes.extend(
50 iter::once(String::from("("))
51 .chain(regex)
52 .chain(iter::once(String::from(")"))),
53 );
54 accumulate.clear();
55
56 let marker = markers
57 .pop()
58 .expect("Expected a `SplitMarker`, found `None` instead.");
59 assert!(
60 matches!(marker, SplitMarker::SLASH { .. }),
61 "`MarkerKind.SLASH` was not found."
62 );
63 in_slash = false;
64 }
65 }
66 else if curr == '|' && !in_quote && !in_slash {
68 consume_lexeme(&mut lexemes, &mut accumulate);
69 lexemes.push(curr.to_string());
70 }
71 else if curr == '"' && !in_slash {
73 if !in_quote {
74 consume_lexeme(&mut lexemes, &mut accumulate);
75 accumulate.push(curr);
76
77 markers.push(SplitMarker::QUOTE { index: i });
78 in_quote = true;
79 } else if is_escaped(&chars, i) {
80 accumulate.pop();
81 accumulate.push(curr);
82 } else {
83 accumulate.push(curr);
84 consume_lexeme(&mut lexemes, &mut accumulate);
85
86 let marker = markers
87 .pop()
88 .expect("Expected a `SplitMarker`, found `None` instead.");
89 assert!(
90 matches!(marker, SplitMarker::QUOTE { .. }),
91 "`MarkerKind.QUOTE` was not found."
92 );
93 in_quote = false;
94 }
95 }
96 else if quantifiers.contains_key(&curr) && !in_quote && !in_slash {
98 let (prev, next) = (peek(&chars, i, -1), peek(&chars, i, 1));
99
100 if prev == ')' {
101 let (open_br, close_br) = quantifiers[&curr].clone();
102 let mut assembled: Vec<String> = Vec::new();
103 let mut depth = -1;
104
105 while let Some(last) = lexemes.pop() {
106 assembled.push(last.clone());
107
108 if last != "(" || depth != 0 {
109 depth += match last.as_str() {
110 ")" => 1,
111 "(" => -1,
112 _ => 0,
113 };
114 } else {
115 break;
116 }
117 }
118
119 lexemes.extend(open_br.iter().map(|s| s.to_string()));
120 lexemes.extend(assembled.into_iter().rev());
121 lexemes.extend(close_br.iter().map(|s| s.to_string()));
122 } else if prev == '(' {
123 if curr == '?' && next == '<' {
124 accumulate.push(curr);
125 } else {
126 let context: String = chars[..i - 1].iter().collect();
127 let source = format!("{}{}", prev, curr);
128
129 return Err(TrailError(format_error(
130 "Invalid quantifier precedence.",
131 &context,
132 &source,
133 )));
134 }
135 } else if prev == '\0' {
136 return Err(TrailError(format_error(
137 "Interval quantifiers must be precedented by either an expression or a group.",
138 "",
139 &curr.to_string(),
140 )));
141 } else {
142 let lexeme = if accumulate.is_empty() {
144 lexemes.pop().expect("Vector should not be empty.")
145 } else {
146 accumulate.iter().collect()
147 };
148 accumulate.clear();
149
150 let (open_br, close_br) = quantifiers[&curr].clone();
151
152 lexemes.extend(open_br.iter().map(|s| s.to_string()));
153 lexemes.push(lexeme);
154 lexemes.extend(close_br.iter().map(|s| s.to_string()));
155 }
156 }
157 else if delimiters.contains(&curr) && !in_quote && !in_slash {
159 consume_lexeme(&mut lexemes, &mut accumulate);
160 lexemes.push(curr.to_string());
161
162 match curr {
163 '(' => markers.push(SplitMarker::GROUP { index: i }),
164 ')' => {
165 markers.pop().ok_or({
166 let context: String = chars[..i].iter().collect();
167
168 TrailError(format_error(
169 "Unexpected `)` - no matching opening parenthesis.",
170 &context,
171 ")",
172 ))
173 })?;
174 }
175 '|' => {}
176 _ => {
177 let context: String = chars[..i].iter().collect();
178
179 Err(TrailError(format_error(
180 "Delimiter reserved for internal use.",
181 &context,
182 &curr.to_string(),
183 )))?;
184 }
185 }
186 }
187 else if curr == '<' && !in_quote && !in_slash {
189 let (prevf, prevs) = (peek(&chars, i, -1), peek(&chars, i, -2));
190
191 if prevf == '$' || (prevf == '?' && prevs == '(') {
192 let mut k = 0;
193
194 let mut next = peek(&chars, i, k);
195 while next != '>' {
196 accumulate.push(next);
197 next = peek(&chars, i, k + 1);
198 k += 1;
199 }
200
201 accumulate.push(next);
202
203 consume_lexeme(&mut lexemes, &mut accumulate);
204 i += k as usize;
205 } else {
206 let context: String = chars[..i].iter().collect();
207
208 return Err(TrailError(format_error(
209 "Reference has invalid precedence - `(?<alphanumeric>...)` to capture, and `$<alphanumeric>`.",
210 &context,
211 &curr.to_string(),
212 )));
213 }
214 }
215 else if curr == '\0' {
217 let context: String = chars[..i].iter().collect();
218
219 return Err(TrailError(format_error(
220 "Reserved null character `\\0`.",
221 &context,
222 &curr.to_string(),
223 )));
224 }
225 else if curr == ' ' {
227 if in_quote || in_slash {
228 accumulate.push(curr);
229 } else {
230 consume_lexeme(&mut lexemes, &mut accumulate);
231 }
232 }
233 else {
235 accumulate.push(curr);
236 }
237
238 i += 1;
239 }
240
241 consume_lexeme(&mut lexemes, &mut accumulate);
242
243 if let Some(marker) = markers.pop() {
244 match marker {
245 SplitMarker::QUOTE { index } => {
246 let context: String = chars[..index].iter().collect();
247
248 return Err(TrailError(format_error(
249 "Unclosed string literal - missing \" to terminate the string.",
250 &context,
251 "\"",
252 )));
253 }
254 SplitMarker::SLASH { index } => {
255 let context: String = chars[..index].iter().collect();
256
257 return Err(TrailError(format_error(
258 "Unterminated regex pattern starting with `/` - add closing delimiter or escape `/` as `\\/`.",
259 &context,
260 "/",
261 )));
262 }
263 SplitMarker::GROUP { index } => {
264 let context: String = chars[..index].iter().collect();
265
266 return Err(TrailError(format_error(
267 "Unmatched '(' - expected a closing ')'.",
268 &context,
269 "(",
270 )));
271 }
272 }
273 }
274
275 return Ok(iter::once(String::from("("))
276 .chain(lexemes.into_iter())
277 .chain(iter::once(String::from(")")))
278 .collect());
279}
280
281pub static UUID: AtomicU64 = AtomicU64::new(0);
282
283#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
284pub struct UUId(u64);
285
286impl UUId {
287 pub fn new() -> Self {
288 Self(UUID.fetch_add(1, Ordering::Relaxed))
289 }
290}
291
292#[derive(Debug, Clone, PartialEq, Eq, Hash)]
293pub struct SymbolVar(pub String);
294
295impl SymbolVar {
296 pub fn new(value: &str) -> Self {
297 Self(value.to_string())
298 }
299
300 pub fn rand() -> Self {
301 Self(format!("{}", UUID.fetch_add(1, Ordering::Relaxed)))
302 }
303}
304
305#[derive(Debug, Clone, PartialEq, Eq, Hash)]
306pub struct SymbolTag(pub String);
307
308impl SymbolTag {
309 pub fn new(value: &str) -> Self {
310 Self(value.to_string())
311 }
312}
313
314#[derive(Debug, Clone, PartialEq, Eq, Hash)]
315pub enum Symbol {
316 TERMINAL {
317 id: UUId,
318 value: String,
319 tags: Vec<SymbolTag>,
320 },
321 VARIABLE {
322 id: UUId,
323 value: SymbolVar,
324 },
325 REFERENCE {
326 id: UUId,
327 value: SymbolTag,
328 },
329 END {
330 id: UUId,
331 },
332}
333
334#[derive(Debug, Clone)]
335pub struct SymbolGraph {
336 pub nodes: HashMap<UUId, Symbol>,
337 pub heads: HashSet<UUId>,
338 pub edges: HashMap<UUId, HashSet<UUId>>,
339 pub tails: HashSet<UUId>,
340}
341
342impl Default for SymbolGraph {
343 fn default() -> Self {
344 Self {
345 nodes: HashMap::new(),
346 heads: HashSet::new(),
347 edges: HashMap::new(),
348 tails: HashSet::new(),
349 }
350 }
351}
352
353fn build_symbol_from_lexeme(lexeme: &str) -> (Symbol, UUId) {
354 if lexeme.starts_with("\"") && lexeme.ends_with("\"") {
355 let id = UUId::new();
356 return (
357 Symbol::TERMINAL {
358 id: id,
359 value: lexeme[1..lexeme.len() - 1].to_string(),
360 tags: Vec::new(),
361 },
362 id,
363 );
364 } else if lexeme.starts_with("$<") && lexeme.ends_with(">") {
365 let id = UUId::new();
366 return (
367 Symbol::REFERENCE {
368 id: id,
369 value: SymbolTag(lexeme[2..lexeme.len() - 1].to_string()),
370 },
371 id,
372 );
373 } else {
374 let id = UUId::new();
375 return (
376 Symbol::VARIABLE {
377 id: id,
378 value: SymbolVar(lexeme.to_string()),
379 },
380 id,
381 );
382 }
383}
384
385pub fn construct_symbol_graph(lexemes: &Vec<String>) -> SymbolGraph {
386 let mut graph = SymbolGraph::default();
387
388 if lexemes.is_empty() {
389 return graph;
390 }
391
392 let (pnode, mut pid) = build_symbol_from_lexeme(&lexemes[0]);
393 graph.nodes.insert(pid, pnode);
394 graph.heads.insert(pid);
395 graph.edges.insert(pid, HashSet::new());
396
397 for lexeme in &lexemes[1..] {
398 let (cnode, cid) = build_symbol_from_lexeme(lexeme);
399 graph.nodes.insert(cid, cnode);
400 graph.edges.insert(pid, HashSet::from([cid]));
401 pid = cid;
402 }
403
404 graph.tails.insert(pid);
405
406 return graph;
407}
408
409pub fn connect_symbol_graph(mut graph_lt: SymbolGraph, mut graph_rt: SymbolGraph) -> SymbolGraph {
410 if graph_lt.edges.is_empty() && graph_rt.edges.is_empty() {
411 return SymbolGraph::default();
412 } else if graph_lt.edges.is_empty() {
413 return graph_rt;
414 } else if graph_rt.edges.is_empty() {
415 return graph_lt;
416 } else {
417 graph_lt
418 .edges
419 .retain(|_, v: &mut HashSet<UUId>| !v.is_empty());
420 graph_rt
421 .edges
422 .retain(|_, v: &mut HashSet<UUId>| !v.is_empty());
423
424 let mut edges: HashMap<UUId, HashSet<UUId>> =
425 graph_lt.edges.into_iter().chain(graph_rt.edges).collect();
426
427 if get_env("SKIP_RULE", true) {
428 let (end_symbols_h, end_symbols_t): (Vec<UUId>, Vec<UUId>) = (
429 graph_lt
430 .heads
431 .iter()
432 .filter(|id| matches!(graph_lt.nodes[id], Symbol::END { .. }))
433 .map(|id| *id)
434 .collect(),
435 graph_lt
436 .tails
437 .iter()
438 .filter(|id| matches!(graph_lt.nodes[id], Symbol::END { .. }))
439 .map(|id| *id)
440 .collect(),
441 );
442 let (size_h, size_t) = (end_symbols_h.len(), end_symbols_t.len());
443
444 assert!(size_h <= 1, "Duplicate `END` head symbol.");
445
446 if size_h == 1 {
447 graph_lt.heads.remove(&end_symbols_h[0]);
448 graph_lt.heads.extend(graph_rt.heads.clone());
449 graph_lt.nodes.remove(&end_symbols_h[0]);
450 }
451
452 assert!(size_t <= 1, "Duplicate `END` tail symbol.");
453
454 if size_t == 1 {
455 let predecessors: Vec<UUId> = edges
456 .iter()
457 .filter(|(_, v)| v.contains(&end_symbols_t[0]))
458 .map(|(k, _)| *k)
459 .collect();
460
461 for predecessor in &predecessors {
462 edges
463 .get_mut(&predecessor)
464 .expect("Predecessor should exist.")
465 .remove(&end_symbols_t[0]);
466 }
467
468 graph_lt.tails.remove(&end_symbols_t[0]);
469 graph_lt.tails.extend(predecessors);
470 graph_lt.nodes.remove(&end_symbols_t[0]);
471 }
472 }
473
474 let (heads, nodes, mut tails) = (
475 graph_lt.heads,
476 graph_lt
477 .nodes
478 .into_iter()
479 .chain(graph_rt.nodes)
480 .collect::<HashMap<UUId, Symbol>>(),
481 graph_rt.tails,
482 );
483
484 for tail in &graph_lt.tails {
485 for head in &graph_rt.heads {
486 if matches!(nodes[&head], Symbol::END { .. }) {
487 tails.insert(*head);
488 }
489
490 edges
491 .entry(*tail)
492 .or_insert_with(HashSet::new)
493 .insert(*head);
494 }
495 }
496
497 return SymbolGraph {
498 nodes: nodes,
499 heads: heads,
500 edges: edges,
501 tails: tails,
502 };
503 }
504}
505
506fn union_symbol_graph(graph_lt: SymbolGraph, graph_rt: SymbolGraph) -> SymbolGraph {
507 if graph_lt.edges.is_empty() && graph_rt.edges.is_empty() {
508 return SymbolGraph::default();
509 } else if graph_lt.edges.is_empty() {
510 return graph_rt;
511 } else if graph_rt.edges.is_empty() {
512 return graph_lt;
513 } else {
514 let (heads_lt, mut heads_rt) = (graph_lt.heads, graph_rt.heads);
515 let (tails_lt, mut tails_rt) = (graph_lt.tails, graph_rt.tails);
516 let (nodes, mut edges): (HashMap<UUId, Symbol>, HashMap<UUId, HashSet<UUId>>) = (
517 graph_lt.nodes.into_iter().chain(graph_rt.nodes).collect(),
518 graph_lt.edges.into_iter().chain(graph_rt.edges).collect(),
519 );
520
521 let (end_symbols_hlt, end_symbols_hrt): (Vec<UUId>, Vec<UUId>) = (
523 heads_lt
524 .iter()
525 .filter(|x| matches!(nodes[x], Symbol::END { .. }))
526 .copied()
527 .collect(),
528 heads_rt
529 .iter()
530 .filter(|x| matches!(nodes[x], Symbol::END { .. }))
531 .copied()
532 .collect(),
533 );
534 let (size_hlt, size_hrt) = (end_symbols_hlt.len(), end_symbols_hrt.len());
535
536 assert!(
537 size_hlt <= 1 && size_hrt <= 1,
538 "Duplicate head `END` symbols."
539 );
540
541 if (size_hlt & size_hrt) == 1 {
542 heads_rt.remove(&end_symbols_hrt[0]);
543 }
544
545 let (end_symbols_tlt, end_symbols_trt): (Vec<UUId>, Vec<UUId>) = (
547 tails_lt
548 .iter()
549 .filter(|x| matches!(nodes[x], Symbol::END { .. }))
550 .copied()
551 .collect(),
552 tails_rt
553 .iter()
554 .filter(|x| matches!(nodes[x], Symbol::END { .. }))
555 .copied()
556 .collect(),
557 );
558 let (size_tlt, size_trt) = (end_symbols_tlt.len(), end_symbols_trt.len());
559
560 assert!(
561 size_tlt <= 1 && size_trt <= 1,
562 "Duplicate tail `END` symbols."
563 );
564
565 if (size_tlt & size_trt) == 1 {
566 let (end_symbol_tlt, end_symbol_trt) = (end_symbols_tlt[0], end_symbols_trt[0]);
567 let predecessors: Vec<UUId> = edges
568 .iter()
569 .filter(|(_, v)| v.contains(&end_symbol_trt))
570 .map(|(k, _)| *k)
571 .collect();
572
573 for predecessor in predecessors {
574 let childrens = edges.get_mut(&predecessor).unwrap();
575 childrens.remove(&end_symbol_trt);
576 childrens.insert(end_symbol_tlt);
577 }
578
579 tails_rt.remove(&end_symbol_trt);
580 }
581
582 let (heads, tails): (HashSet<UUId>, HashSet<UUId>) = (
583 heads_lt.into_iter().chain(heads_rt).collect(),
584 tails_lt.into_iter().chain(tails_rt).collect(),
585 );
586
587 return SymbolGraph {
588 nodes: nodes,
589 heads: heads,
590 edges: edges,
591 tails: tails,
592 };
593 }
594}
595
596mod bitflags {
597 #[derive(Debug, PartialEq, Clone, Copy)]
598 pub struct DelimiterProperty(pub u32);
599
600 pub const NULL: DelimiterProperty = DelimiterProperty(0 << 0);
601 pub const STOP: DelimiterProperty = DelimiterProperty(1 << 0);
602 pub const LOOP: DelimiterProperty = DelimiterProperty(1 << 1);
603 pub const PIPE: DelimiterProperty = DelimiterProperty(1 << 2);
604}
605
606fn cast_symbol_graph(graph: &mut SymbolGraph, properties: bitflags::DelimiterProperty) {
607 let (nodes, heads, edges, tails) = (
608 &mut graph.nodes,
609 &mut graph.heads,
610 &mut graph.edges,
611 &mut graph.tails,
612 );
613
614 let (end_symbol_h, end_symbol_t) = (
615 heads
616 .iter()
617 .find_map(|x| (matches!(nodes[x], Symbol::END { .. })).then_some(nodes[x].clone())),
618 tails
619 .iter()
620 .find_map(|x| (matches!(nodes[x], Symbol::END { .. })).then_some(nodes[x].clone())),
621 );
622
623 let end_symbol: Symbol = end_symbol_h
624 .or(end_symbol_t)
625 .unwrap_or(Symbol::END { id: UUId::new() });
626 let end_id = if let Symbol::END { id } = end_symbol {
627 id
628 } else {
629 unreachable!()
630 };
631
632 if (properties.0 & bitflags::LOOP.0) > 0 {
633 if tails.contains(&end_id) {
634 let parents = edges
635 .iter()
636 .filter(|(_, v)| v.contains(&end_id))
637 .map(|(k, _)| *k);
638
639 tails.extend(parents);
640 }
641
642 for tail in &*tails {
643 for head in &*heads {
644 if !matches!(nodes[tail], Symbol::END { .. }) {
645 edges
646 .entry(*tail)
647 .or_insert_with(HashSet::new)
648 .insert(*head);
649 }
650 }
651 if !matches!(nodes[tail], Symbol::END { .. }) {
652 edges
653 .entry(*tail)
654 .or_insert_with(HashSet::new)
655 .insert(end_id);
656 }
657 }
658
659 *tails = HashSet::from([end_id]);
660 nodes.insert(end_id, end_symbol);
661 } else if (properties.0 & bitflags::STOP.0) > 0 {
662 heads.insert(end_id);
663 nodes.insert(end_id, end_symbol);
664 edges.insert(end_id, HashSet::new());
665 }
666}
667
668#[derive(Clone)]
669struct TrailAccumulator {
670 kind: bitflags::DelimiterProperty,
671 graph: SymbolGraph,
672 tag: String,
673}
674
675impl Default for TrailAccumulator {
676 fn default() -> Self {
677 Self {
678 kind: bitflags::NULL,
679 graph: SymbolGraph::default(),
680 tag: String::new(),
681 }
682 }
683}
684
685pub fn build_symbol_graph(definition: &str) -> Result<SymbolGraph, TrailError> {
686 let bitflags: HashMap<String, bitflags::DelimiterProperty> = HashMap::from([
687 ("(".to_string(), bitflags::NULL),
688 ("[".to_string(), bitflags::STOP),
689 ("{".to_string(), bitflags::LOOP),
690 ("|".to_string(), bitflags::PIPE),
691 ]);
692
693 let lexemes = split_definition_into_lexemes(definition)?;
694
695 let (mut state, mut accumulated): (Vec<TrailAccumulator>, Vec<String>) =
696 (vec![TrailAccumulator::default()], Vec::new());
697 let (mut i, l_size) = (0, lexemes.len());
698
699 while i < l_size {
700 let (lexeme, s_size) = (&lexemes[i], state.len());
701
702 if matches!(lexeme.as_str(), "(" | "[" | "{" | "|") {
703 let next_graph = construct_symbol_graph(&accumulated);
704 state[s_size - 1].graph =
705 connect_symbol_graph(state[s_size - 1].graph.clone(), next_graph);
706 state.push(TrailAccumulator {
707 kind: bitflags[lexeme],
708 ..TrailAccumulator::default()
709 });
710 accumulated.clear();
711 } else if lexeme.starts_with("?<") && lexeme.ends_with(">") {
712 state[s_size - 1].tag = lexeme[2..lexeme.len() - 1].to_string();
713 } else if matches!(lexeme.as_str(), ")" | "]" | "}") {
714 let next_graph = construct_symbol_graph(&accumulated);
715 state[s_size - 1].graph =
716 connect_symbol_graph(state[s_size - 1].graph.clone(), next_graph);
717 accumulated.clear();
718
719 let mut accumulator = state.pop().expect("Build state should not be empty.");
720
721 while accumulator.kind == bitflags::PIPE {
722 let graph = &mut state.last_mut().unwrap().graph;
723 *graph = union_symbol_graph(graph.clone(), accumulator.graph);
724 accumulator = state.pop().expect("Build state should not be empty.");
725 }
726
727 cast_symbol_graph(&mut accumulator.graph, accumulator.kind);
728
729 let tag = accumulator.tag;
730 for symbol in accumulator.graph.nodes.values_mut() {
731 if let Symbol::TERMINAL { tags, .. } = symbol
732 && !tag.is_empty()
733 {
734 tags.push(SymbolTag(tag.clone()))
735 }
736 }
737
738 let s_size = state.len();
739 state[s_size - 1].graph =
740 connect_symbol_graph(state[s_size - 1].graph.clone(), accumulator.graph);
741 } else {
742 accumulated.push(lexeme.to_string());
743 }
744
745 i += 1;
746 }
747
748 assert_eq!(state.len(), 1, "Only one TrailAccumulator should remain.");
749
750 return Ok(state.pop().unwrap().graph);
751}