1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
//! Extract literal prefixes and suffixes from an `Repr<I>`.
//! <https://en.wikipedia.org/wiki/Brzozowski_derivative>
#[cfg(feature = "quotient")]
mod search;
use alloc::vec::Vec;
use core::{cmp, fmt::Debug, mem, ops::Deref};
use unconst::unconst;
use crate::interval::Interval;
use crate::repr::{
Repr::{self, Add, And, Div, Inf, Mul, One, Or, Sup},
Zero,
};
use crate::seq::Seq;
use crate::traits::Integral;
#[cfg(feature = "quotient")]
pub use search::LiteralSearcher;
/// A set of Seqs extracted from a Repr.
///
/// Every member of the set is a `Literal`, which is represented by a
/// `Seq<I>`. Every member is said to be either *complete* or *cut*. A complete literal means that it extends until the beginning (or end) of the regular expression. In some circumstances, this can be used to indicate a match in the regular expression.
///
/// A key aspect of literal extraction is knowing when to stop. It is not
/// feasible to blindly extract all literals from a regular expression, even if
/// there are finitely many. For example, the regular expression `[0-9]{10}`
/// has `10^10` distinct literals. For this reason, literal extraction is
/// bounded to some low number by default using heuristics, but the limits can
/// be tweaked.
#[unconst]
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Set<I: ~const Integral>(Vec<Literal<I>>);
/*
This limit also applies to case insensitive literals, since each
character in the case insensitive literal is converted to a class, and
then case folded.
*/
#[unconst]
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Literal<I: ~const Integral> {
seq: Seq<I>,
cut: bool,
}
#[unconst]
impl<I: ~const Integral> Set<I> {
/// Returns a new empty set of literals using default limits.
pub fn empty() -> Set<I> {
Set(Vec::new())
}
/// Returns a set of literal prefixes extracted from the given `Repr<I>`.
pub fn prefixes(expr: &Repr<I>) -> Set<I> {
let mut set = Set::empty();
set.union_prefixes(expr);
set
}
/// Returns a set of literal suffixes extracted from the given `Repr<I>`.
pub fn suffixes(expr: &Repr<I>) -> Set<I> {
let mut output = Self::prefixes(&expr.rev());
output.reverse();
output
}
/// Returns the length of the smallest literal.
///
/// Returns None is there are no literals in the set.
pub fn min_len(&self) -> Option<usize> {
let mut min = None;
for lit in &self.0 {
match min {
None => min = Some(lit.seq.len()),
Some(m) if lit.len() < m => min = Some(lit.len()),
_ => {}
}
}
min
}
/// Returns true if all members in this set are complete.
pub fn all_complete(&self) -> bool {
!self.0.is_empty() && self.0.iter().all(|seq| !seq.cut)
}
/// Returns true if any member in this set is complete.
pub fn any_complete(&self) -> bool {
self.0.iter().any(|lit| !lit.cut)
}
/// Returns true if this set contains an empty literal.
pub fn contains_empty(&self) -> bool {
self.0.iter().any(|lit| lit.is_empty())
}
/// Returns true if this set is empty or if all of its members is empty.
pub fn is_empty(&self) -> bool {
self.0.is_empty() || self.0.iter().all(|lit| lit.is_empty())
}
/// Returns the longest common prefix of all members in this set.
pub fn longest_common_prefix(&self) -> &[I] {
if self.is_empty() {
return &[];
}
let lit0 = &*self.0[0];
let mut len = lit0.len();
for lit in &self.0[1..] {
len = cmp::min(
len,
(lit.seq.deref())
.iter()
.zip(*lit0)
.take_while(|(a, b)| a == &b)
.count(),
);
}
&self.0[0][..len]
}
/// Returns the longest common suffix of all members in this set.
pub fn longest_common_suffix(&self) -> &[I] {
if self.is_empty() {
return &[];
}
let lit0 = &*self.0[0];
let mut len = lit0.len();
for lit in &self.0[1..] {
len = cmp::min(
len,
lit.iter()
.rev()
.zip(lit0.iter().rev())
.take_while(|&(a, b)| a == b)
.count(),
);
}
&self.0[0][self.0[0].len() - len..]
}
/// Returns a new set of literals with the given len trimmed
/// from the suffix of each literal.
///
/// If any literal would be cut out completely by trimming, then None is
/// returned.
///
/// Any duplicates that are created as a result of this transformation are
/// removed.
pub fn trim_suffix(&self, len: usize) -> Option<Set<I>> {
if self.min_len().map(|len_| len_ <= len).unwrap_or(true) {
return None;
}
let mut new = Self::empty();
for mut lit in self.0.iter().cloned() {
let new_len = lit.len() - len;
lit.truncate(new_len);
lit.cut = true;
new.0.push(lit);
}
new.0.sort();
new.0.dedup();
Some(new)
}
/// Returns a new set of prefixes of this set of literals that are
/// guaranteed to be unambiguous.
///
/// Any substring match with a member of the set is returned is guaranteed
/// to never overlap with a substring match of another member of the set
/// at the same starting position.
///
/// Given any two members of the returned set, neither is a substring of
/// the other.
pub fn unambiguous_prefixes(&self) -> Set<I> {
if self.0.is_empty() {
return Self::empty();
}
let mut old = self.0.to_vec();
let mut new = Self::empty();
'OUTER: while let Some(mut candidate) = old.pop() {
if candidate.is_empty() {
continue;
}
if new.0.is_empty() {
new.0.push(candidate);
continue;
}
for lit2 in &mut new.0 {
if lit2.is_empty() {
continue;
}
if &candidate == lit2 {
// If the literal is already in the set, then we can
// just drop it. But make sure that cut literals are
// infectious!
candidate.cut = candidate.cut || lit2.cut;
lit2.cut = candidate.cut;
continue 'OUTER;
}
if candidate.len() < lit2.len() {
if let Some(i) = position(&candidate, &lit2) {
candidate.cut = true;
let mut lit3 = lit2.clone();
lit3.truncate(i);
lit3.cut = true;
old.push(lit3);
lit2.clear();
}
} else if let Some(i) = position(&lit2, &candidate) {
lit2.cut = true;
let mut new_candidate = candidate.clone();
new_candidate.truncate(i);
new_candidate.cut = true;
old.push(new_candidate);
candidate.clear();
}
// Oops, the candidate is already represented in the set.
if candidate.is_empty() {
continue 'OUTER;
}
}
new.0.push(candidate);
}
new.0.retain(|lit| !lit.is_empty());
new.0.sort();
new.0.dedup();
new
}
/// Returns a new set of suffixes of this set of literals that are
/// guaranteed to be unambiguous.
///
/// Any substring match with a member of the set is returned is guaranteed
/// to never overlap with a substring match of another member of the set
/// at the same ending position.
///
/// Given any two members of the returned set, neither is a substring of
/// the other.
pub fn unambiguous_suffixes(&self) -> Set<I> {
// This is a touch wasteful...
let mut set = self.clone();
set.reverse();
let mut unamb = set.unambiguous_prefixes();
unamb.reverse();
unamb
}
/// Unions the prefixes from the given expression to this set.
///
/// If prefixes could not be added (for example, the set of prefixes from `expr` includes the empty
/// string), then false is returned.
///
/// Note that prefix literals extracted from `expr` are said to be complete
/// if and only if the literal extends from the beginning of `expr` to the
/// end of `expr`.
pub fn union_prefixes(&mut self, expr: &Repr<I>) {
let mut set = Self::empty();
prefixes(expr, &mut set);
if !set.is_empty() && !set.contains_empty() {
self.union(set);
}
}
/// Unions this set with another set.
pub fn union(&mut self, other: Self) {
if !other.is_empty() {
self.0.extend(other.0);
} else {
// TODO(rnarkk) experiment
// self.0.push(Literal::empty());
panic!("other is empty");
}
}
/// Extends this set with another set.
///
/// The set of literals is extended via a cross product.
pub fn cross_product(&mut self, other: &Self) {
assert!(!other.is_empty());
if other.is_empty() {
return;
}
let mut base = self.remove_complete();
if base.is_empty() {
base = vec![Literal::empty()];
}
for seq in other.0 {
for mut self_lit in base.clone() {
self_lit.seq.mul(seq.seq);
self_lit.cut = seq.cut;
self.0.push(self_lit);
}
}
}
/// Extends each literal in this set with the Seq given.
///
/// If the set is empty, then the given literal is added to the set.
///
/// If a prefix of `bytes` can be fit into this set, then it is used and all
/// resulting literals are cut.
pub fn cross_add(&mut self, seq: &Seq<I>) {
// N.B. This could be implemented by simply calling cross_product with
// a literal set containing just `bytes`, but we can be smarter about
// taking shorter prefixes of `bytes` if they'll fit.
// if bytes.is_empty() {
// return true;
// }
if self.0.is_empty() {
self.0.push(Literal::new(seq.to_owned()));
self.0[0].cut = false;
// TODO(rnarkk)
// return !self.0[0].cut;
}
for seq in &mut self.0 {
if !seq.cut {
seq.extend(*seq);
}
}
}
pub fn add(&mut self, seq: Literal<I>) {
self.0.push(seq);
}
/// Extends each literal in this set with the Interval given, writing the bytes of each character in reverse when `Interval<char>`.
pub fn add_interval(&mut self, interval: &Interval<I>) {
let mut base = self.remove_complete();
if base.is_empty() {
base = vec![Literal::empty()];
}
for c in *interval {
for mut lit in base.clone() {
lit.push(c);
self.0.push(lit);
}
}
}
/// Cuts every member of this set. When a member is cut, it can never
/// be extended.
pub fn cut(&mut self) {
for lit in &mut self.0 {
lit.cut = true;
}
}
/// Reverses all members in place.
pub fn reverse(&mut self) {
for lit in &mut self.0 {
lit.reverse();
}
}
/// Pops all 'complete' literals out of this set.
fn remove_complete(&mut self) -> Vec<Literal<I>> {
let mut output = Vec::new();
for seq in mem::take(&mut self.0) {
if seq.cut {
self.0.push(seq);
} else {
output.push(seq);
}
}
output
}
}
#[unconst]
const fn prefixes<I: ~const Integral>(expr: &Repr<I>, set: &mut Set<I>)
where
I: ~const Integral,
{
match expr {
Repr::Zero(_) => {}
One(seq) => set.cross_add(&seq),
Repr::Interval(ref interval) => set.add_interval(interval),
Or(ref lhs, ref rhs) => {
let mut lits2 = Set::empty();
for e in [lhs, rhs] {
let mut lits3 = Set::empty();
prefixes(e, &mut lits3);
if lits3.is_empty() || !lits2.union(lits3) {
// If we couldn't find suffixes for *any* of the
// alternates, then the entire alternation has to be thrown
// away and any existing members must be frozen. Similarly,
// if the union couldn't complete, stop and freeze.
set.cut();
}
}
if !set.cross_product(&lits2) {
set.cut();
}
}
Inf(repr) => {
let (mut lits2, mut lits3) = (set.clone(), Set::empty());
prefixes(repr, &mut lits3);
if lits3.is_empty() || !lits2.cross_product(&lits3) {
set.cut();
return;
}
lits2.cut();
lits2.add(Literal::empty());
set.union(lits2);
}
And(ref lhs, ref rhs) => {
for e in [lhs, rhs] {
if let Repr::Zero(Zero::StartText) = **e {
if !set.is_empty() {
set.cut();
break;
}
set.add(Literal::empty());
continue;
}
let mut lits2 = Set::empty();
prefixes(e, &mut lits2);
if !set.cross_product(&lits2) || !lits2.any_complete() {
// If this expression couldn't yield any literal that
// could be extended, then we need to quit. Since we're
// short-circuiting, we also need to freeze every member.
set.cut();
break;
}
}
}
_ => set.cut(),
}
}
#[unconst]
impl<I: ~const Integral> Literal<I> {
/// Returns a new complete literal with the bytes given.
pub fn new(seq: Seq<I>) -> Literal<I> {
Literal {
seq: seq.into(),
cut: false,
}
}
/// Returns a new complete empty literal.
pub fn empty() -> Literal<I> {
Literal {
seq: Seq::empty(),
cut: false,
}
}
}
#[unconst]
impl<I: ~const Integral> Deref for Literal<I> {
type Target = Seq<I>;
fn deref(&self) -> &Seq<I> {
&self.seq
}
}
#[unconst]
const fn position<I>(needle: &[I], mut context: &[I]) -> Option<usize>
where
I: ~const Integral,
{
let mut i = 0;
while context.len() >= needle.len() {
if needle == &context[..needle.len()] {
return Some(i);
}
i += 1;
context = &context[1..];
}
None
}
fn char_len_lossy(bytes: &[u8]) -> usize {
String::from_utf8_lossy(bytes).chars().count()
}
#[unconst]
/// Parsed represents a flattened Repr and their detected seqs.
pub struct Parsed<I: ~const Integral> {
pub reprs: Vec<Repr<I>>,
pub prefixes: Set<I>,
pub suffixes: Set<I>,
}
#[unconst]
impl<I: ~const Integral> Parsed<I> {
/// Parse the current set of patterns into their AST and extract seqs.
pub fn parse(repr: &Repr<I>) -> Parsed<I> {
let mut prefixes = Some(Set::empty());
let mut suffixes = Some(Set::empty());
let is_set = true;
// If we're compiling a regex set and that set has any anchored
// expressions, then disable all literal optimizations.
let mut reprs = Vec::new();
let mut current = repr;
while let Add(lhs, rhs) = current {
if !repr.is_anchored_start() && repr.is_any_anchored_start() {
// Partial anchors unfortunately make it hard to use
// prefixes, so disable them.
prefixes = None;
} else if repr.is_anchored_start() {
// Regex sets with anchors do not go well with literal
// optimizations.
prefixes = None;
}
prefixes = prefixes.and_then(|mut prefixes| {
if !prefixes.union_prefixes(&repr) {
None
} else {
Some(prefixes)
}
});
if !repr.is_anchored_end() && repr.is_any_anchored_end() {
// Partial anchors unfortunately make it hard to use
// suffixes, so disable them.
suffixes = None;
} else if repr.is_anchored_end() {
// Regex sets with anchors do not go well with literal
// optimizations.
suffixes = None;
}
suffixes = suffixes.and_then(|mut suffixes| {
if !suffixes.union_suffixes(&repr) {
None
} else {
Some(suffixes)
}
});
reprs.push(repr);
}
Parsed {
reprs,
prefixes: prefixes.unwrap_or_else(Set::empty),
suffixes: suffixes.unwrap_or_else(Set::empty),
}
}
}