1#![doc(hidden)]
2
3use itertools::iproduct;
4use regex_syntax::hir::{self, Hir, HirKind, Visitor, visit};
5use std::cell::Cell;
6use std::fmt::{Display, Formatter, Write};
7use std::str::Utf8Error;
8use std::{collections::BTreeSet, ops::Deref};
9
10const EXACT_CROSS_LIMIT: usize = 16;
11
12#[derive(Clone, Debug)]
13pub enum Model {
14 All(Cell<usize>),
16 None(Cell<usize>),
18 Atom(Cell<usize>, String),
20 And(Cell<usize>, Vec<Model>),
22 Or(Cell<usize>, Vec<Model>),
24}
25use Model::{All, And, Atom, None, Or};
26
27impl std::hash::Hash for Model {
28 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
29 state.write_u8(self.op());
30 match self {
31 All(_) | None(_) => (),
32 Atom(_, s) => s.hash(state),
33 And(_, ps) | Or(_, ps) => {
34 state.write_usize(ps.len());
35 for p in ps {
36 state.write_usize(p.unique_id());
37 }
38 }
39 }
40 }
41}
42
43impl std::cmp::PartialEq for Model {
44 fn eq(&self, other: &Self) -> bool {
45 match (self, other) {
46 (All(_), All(_)) | (None(_), None(_)) => true,
47 (Atom(_, a), Atom(_, b)) => a == b,
48 (And(_, va), And(_, vb)) | (Or(_, va), Or(_, vb)) => {
49 va.len() == vb.len()
50 && std::iter::zip(va, vb).all(|(a, b)| a.unique_id() == b.unique_id())
51 }
52 _ => false,
53 }
54 }
55}
56impl Eq for Model {}
57
58impl From<String> for Model {
59 fn from(s: String) -> Self {
60 Atom(Cell::new(usize::MAX), s)
61 }
62}
63
64impl Display for Model {
65 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
66 match &self {
67 All(_) => f.write_str(""),
68 None(_) => f.write_str("*no-matches*"),
69 Atom(_, s) => f.write_str(s),
70 And(_, subs) => {
71 for (i, s) in subs.iter().enumerate() {
72 if i != 0 {
73 f.write_char(' ')?;
74 }
75 write!(f, "{s}")?;
76 }
77 Ok(())
78 }
79 Or(_, subs) => {
80 f.write_char('(')?;
81 for (i, s) in subs.iter().enumerate() {
82 if i != 0 {
83 f.write_char('|')?;
84 }
85 write!(f, "{s}")?;
86 }
87 f.write_char(')')
88 }
89 }
90 }
91}
92
93#[derive(Debug)]
95pub enum Error {
96 FinalizationError,
98 EarlyStop,
100 DecodeError(Utf8Error),
102 ClassError(hir::ClassBytes),
104}
105impl Display for Error {
106 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
107 write!(f, "{self:?}")
108 }
109}
110impl std::error::Error for Error {}
111impl From<Utf8Error> for Error {
112 fn from(value: Utf8Error) -> Self {
113 Error::DecodeError(value)
114 }
115}
116
117impl Model {
118 pub fn new(r: &Hir) -> Result<Self, Error> {
119 visit(r, InfoVisitor::default())
120 }
121
122 pub fn unique_id(&self) -> usize {
123 match self {
124 All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.get(),
125 }
126 }
127 pub fn set_unique_id(&self, value: usize) {
128 match self {
129 All(id) | None(id) | Atom(id, _) | And(id, _) | Or(id, _) => id.set(value),
130 }
131 }
132
133 pub fn all() -> Self {
134 All(Cell::new(usize::MAX))
135 }
136
137 pub fn none() -> Self {
138 None(Cell::new(usize::MAX))
139 }
140
141 fn or_strings(strings: SSet) -> Self {
142 Model::Or(
143 Cell::new(usize::MAX),
144 simplify_string_set(strings).map(From::from).collect(),
145 )
146 }
147
148 fn op(&self) -> u8 {
149 match self {
150 All(_) => 0,
151 None(_) => 1,
152 Atom(_, _) => 2,
153 And(_, _) => 3,
154 Or(_, _) => 4,
155 }
156 }
157
158 fn simplify(self) -> Self {
160 match self {
161 And(uid, v) if v.is_empty() => All(uid),
162 Or(uid, v) if v.is_empty() => None(uid),
163 And(_, mut v) | Or(_, mut v) if v.len() == 1 => {
164 v.pop().expect("we checked the length").simplify()
165 }
166 s => s,
167 }
168 }
169
170 fn and(self, mut b: Self) -> Self {
174 let mut a = self.simplify();
175 b = b.simplify();
176
177 if a.op() > b.op() {
179 std::mem::swap(&mut a, &mut b);
180 }
181
182 a = match a {
184 All(..) => return b,
186 None(uid) => return None(uid),
188 a => a,
189 };
190
191 match (a, b) {
192 (And(unique_id, mut va), And(_, vb)) => {
194 va.extend(vb);
195 And(unique_id, va)
196 }
197 (And(unique_id, mut v), vv) | (vv, And(unique_id, mut v)) => {
199 v.push(vv);
200 And(unique_id, v)
201 }
202 (a, b) => And(Cell::new(usize::MAX), vec![a, b]),
203 }
204 }
205
206 fn or(self, mut b: Self) -> Self {
207 let mut a = self.simplify();
208 b = b.simplify();
209
210 if a.op() > b.op() {
212 std::mem::swap(&mut a, &mut b);
213 }
214
215 a = match a {
216 None(..) => return b,
218 All(uid) => return All(uid),
220 a => a,
221 };
222
223 match (a, b) {
224 (Or(unique_id, mut va), Or(_, vb)) => {
226 va.extend(vb);
227 Or(unique_id, va)
228 }
229 (Or(unique_id, mut v), vv) | (vv, Or(unique_id, mut v)) => {
231 v.push(vv);
232 Or(unique_id, v)
233 }
234 (a, b) => Or(Cell::new(usize::MAX), vec![a, b]),
235 }
236 }
237}
238
239#[derive(PartialEq, Eq, Debug, Clone)]
250struct LengthThenLex(pub String);
251impl Deref for LengthThenLex {
252 type Target = String;
253
254 fn deref(&self) -> &Self::Target {
255 &self.0
256 }
257}
258impl Ord for LengthThenLex {
259 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
260 self.0
261 .len()
262 .cmp(&other.0.len())
263 .then_with(|| self.0.cmp(&other.0))
264 }
265}
266impl PartialOrd for LengthThenLex {
267 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
268 Some(self.cmp(other))
269 }
270}
271type SSet = BTreeSet<LengthThenLex>;
272fn simplify_string_set(strings: SSet) -> impl Iterator<Item = String> {
273 let mut to_keep = vec![true; strings.len()];
274 let mut e = strings.iter().enumerate();
275 while let Some((i, s)) = e.next() {
276 if s.is_empty() || !to_keep[i] {
277 continue;
278 }
279
280 for (keep, (_, s2)) in to_keep[i..].iter_mut().skip(1).zip(e.clone()) {
281 if *keep && s2.len() > s.len() && s2.0.contains(&s.0) {
282 *keep = false;
283 }
284 }
285 }
286
287 std::iter::zip(to_keep, strings)
288 .filter(|v| v.0)
289 .map(|v| v.1.0)
290}
291
292#[derive(Debug)]
295enum Info {
296 Match(Model),
297 Exact(SSet),
298}
299impl Info {
300 fn take_match(self) -> Model {
301 match self {
302 Self::Match(p) => p,
303 Self::Exact(s) => Model::or_strings(s),
304 }
305 }
306
307 fn into_exact(self) -> Option<SSet> {
308 match self {
309 Self::Exact(s) => Some(s),
310 Self::Match(_) => Option::None,
311 }
312 }
313}
314
315struct InfoVisitor {
316 stack: Vec<Info>,
317 max_visits: usize,
318}
319impl Default for InfoVisitor {
320 fn default() -> Self {
321 Self {
322 max_visits: 100_000,
323 stack: Vec::new(),
324 }
325 }
326}
327
328impl Visitor for InfoVisitor {
332 type Output = Model;
333 type Err = Error;
334
335 fn finish(mut self) -> Result<Self::Output, Self::Err> {
336 (self.stack.len() == 1)
337 .then_some(&mut self.stack)
338 .and_then(|s| s.pop())
339 .map(Info::take_match)
340 .ok_or(Error::FinalizationError)
341 }
342
343 fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
344 self.max_visits = self.max_visits.checked_sub(1).ok_or(Error::EarlyStop)?;
348
349 Ok(())
350 }
351
352 fn visit_post(&mut self, hir: &Hir) -> Result<(), Self::Err> {
353 match hir.kind() {
354 HirKind::Empty | HirKind::Look(_) => {
355 self.stack
356 .push(Info::Exact([LengthThenLex(String::new())].into()));
357 }
358 HirKind::Literal(hir::Literal(data)) => {
359 if data.is_empty() {
360 self.stack.push(Info::Match(Model::none()));
362 } else {
363 self.stack.push(Info::Exact(
368 [LengthThenLex(
369 std::str::from_utf8(data)?.to_ascii_lowercase(),
370 )]
371 .into(),
372 ));
373 }
374 }
375 HirKind::Class(cls) => {
376 let uc;
377 let c = match cls {
378 hir::Class::Unicode(c) => c,
379 hir::Class::Bytes(b) => {
380 uc = b
381 .to_unicode_class()
382 .ok_or_else(|| Error::ClassError(b.clone()))?;
383 &uc
384 }
385 };
386 self.stack
387 .push(if c.iter().map(|r| r.len()).sum::<usize>() > 10 {
388 Info::Match(Model::all())
389 } else {
390 Info::Exact(
391 c.iter()
392 .flat_map(|r| r.start()..=r.end())
393 .map(|c| c.to_ascii_lowercase())
394 .map(String::from)
395 .map(LengthThenLex)
396 .collect(),
397 )
398 });
399 }
400 HirKind::Repetition(hir::Repetition { min, max, .. }) => {
404 match min {
405 0 => {
406 self.stack.pop();
407 self.stack.push(Info::Match(Model::all()));
408 }
409 &min => {
410 let arg = self
411 .stack
412 .pop()
413 .expect("a repetition to be associated with a pattern to repeat");
414 match arg {
415 Info::Exact(mut arg) if arg.len() == 1 => {
416 let s = arg.pop_first().unwrap();
417 let minsize = min as usize;
418 if Some(min) == *max && (minsize * s.len() < 2048) {
424 let set = [LengthThenLex(s.repeat(minsize))].into();
425 self.stack.push(Info::Exact(set));
426 } else {
427 let min = (2048 / s.len()).clamp(1, minsize);
428 let set = [LengthThenLex(s.repeat(min))].into();
429 self.stack.push(Info::Match(Model::or_strings(set)));
430 }
431 }
432 Info::Exact(arg) if arg.len().pow(min) <= EXACT_CROSS_LIMIT => {
437 let mut acc = arg.clone();
438 for _ in 1..min {
439 acc = iproduct!(&acc, &arg)
440 .map(|(s, ss)| {
441 let mut r = String::with_capacity(s.len() + ss.len());
442 r.push_str(s);
443 r.push_str(ss);
444 LengthThenLex(r)
445 })
446 .collect();
447 }
448 if Some(min) == *max {
449 self.stack.push(Info::Exact(acc));
450 } else {
451 self.stack.push(Info::Match(Model::or_strings(acc)));
452 }
453 }
454 arg => {
455 self.stack.push(Info::Match(arg.take_match()));
456 }
457 }
458 }
459 }
460 }
461 HirKind::Capture(_) => (),
464 HirKind::Alternation(alt) => {
465 let topn = self.stack.len() - alt.len()..;
471 let infos = &mut self.stack[topn.clone()];
472
473 let matches =
474 topn.start + infos.iter().filter(|v| matches!(v, Info::Match(_))).count();
475 infos.sort_unstable_by_key(|v| match v {
479 Info::Match(_) => (false, 0),
480 Info::Exact(s) => (true, s.len()),
481 });
482 let exacts = self
484 .stack
485 .drain(matches..)
486 .rev()
487 .fold(BTreeSet::new(), |mut s, i| {
488 s.append(
489 &mut i
490 .into_exact()
491 .expect("the top `matches` records should be exacts"),
492 );
493 s
494 });
495 let mut matches = self
496 .stack
497 .drain(topn)
498 .map(Info::take_match)
499 .collect::<Vec<_>>();
500 self.stack.push(if matches.is_empty() {
501 Info::Exact(exacts)
502 } else {
503 if !exacts.is_empty() {
504 matches.push(Model::or_strings(exacts));
505 }
506 Info::Match(matches.into_iter().fold(Model::none(), Model::or))
507 });
508 }
509 HirKind::Concat(c) => {
510 let topn = self.stack.len() - c.len()..;
511
512 let mut result = Info::Match(Model::all());
514 let mut resulted = false;
515 let mut exacts = BTreeSet::new();
516 for info in self.stack.drain(topn) {
517 match info {
518 Info::Exact(set) if exacts.is_empty() => {
519 exacts = set;
520 }
521 Info::Exact(set) if exacts.len() == 1 && set.len() == 1 => {
522 let r = exacts.pop_first().expect("exacts to be non-empty").0
523 + set.first().expect("set to be non-empty");
524
525 exacts.insert(LengthThenLex(r));
526 }
527 Info::Exact(set) => {
528 if set.len() * exacts.len() <= EXACT_CROSS_LIMIT {
529 exacts = iproduct!(&exacts, &set)
533 .map(|(s, ss)| {
534 let mut r = String::with_capacity(s.len() + ss.len());
535 r.push_str(s);
536 r.push_str(ss);
537 LengthThenLex(r)
538 })
539 .collect();
540 } else {
541 resulted = true;
542 result = Info::Match(Model::and(
543 result.take_match(),
544 Model::or_strings(exacts),
545 ));
546 exacts = set;
547 }
548 }
549 i => {
550 resulted = true;
551 let mut p = result.take_match();
554 if !exacts.is_empty() {
555 p = Model::and(p, Model::or_strings(std::mem::take(&mut exacts)));
556 }
557 p = Model::and(p, i.take_match());
558 result = Info::Match(p);
559 }
560 }
561 }
562
563 self.stack.push(if exacts.is_empty() {
564 result
565 } else if !resulted {
566 Info::Exact(exacts)
567 } else {
568 Info::Match(Model::and(result.take_match(), Model::or_strings(exacts)))
569 });
570 }
571 }
572 Ok(())
573 }
574}