1use crate::bigram_filter::BigramFilter;
16use regex_syntax::hir::{Class, Hir, HirKind};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19
20const MAX_CLASS_EXPAND: usize = 16;
23
24#[inline]
25fn consec_key(a: u8, b: u8) -> Option<u16> {
26 let al = a.to_ascii_lowercase();
27 let bl = b.to_ascii_lowercase();
28 if (32..=126).contains(&al) && (32..=126).contains(&bl) {
29 Some((al as u16) << 8 | bl as u16)
30 } else {
31 None
32 }
33}
34
35#[derive(Debug, Clone)]
36pub enum BigramQuery {
37 Any,
38 Consec(u16),
40 Skip1(u16),
42 And(Vec<BigramQuery>),
44 Or(Vec<BigramQuery>),
46}
47
48#[inline]
50fn bitset_or(a: &mut [u64], b: &[u64]) {
51 a.iter_mut().zip(b.iter()).for_each(|(x, y)| *x |= *y);
52}
53
54#[inline]
56fn bitset_and(a: &mut [u64], b: &[u64]) {
57 a.iter_mut().zip(b.iter()).for_each(|(x, y)| *x &= *y);
58}
59
60impl BigramQuery {
61 pub fn is_any(&self) -> bool {
62 matches!(self, BigramQuery::Any)
63 }
64
65 pub(crate) fn evaluate(&self, index: &BigramFilter) -> Option<Vec<u64>> {
66 self.evaluate_cow(index).map(Cow::into_owned)
67 }
68
69 fn evaluate_cow<'a>(&self, index: &'a BigramFilter) -> Option<Cow<'a, [u64]>> {
70 match self {
71 BigramQuery::Any => None,
72
73 BigramQuery::Consec(key) => {
74 let col = index.lookup()[*key as usize];
75 if col == u16::MAX {
76 return None;
77 }
78 let words = index.words();
79 let offset = col as usize * words;
80 let data = index.dense_data();
81 if offset + words > data.len() {
82 return None;
83 }
84 Some(Cow::Borrowed(&data[offset..offset + words]))
85 }
86
87 BigramQuery::Skip1(key) => {
88 let skip = index.skip_index()?;
89 let col = skip.lookup()[*key as usize];
90 if col == u16::MAX {
91 return None;
92 }
93 let words = skip.words();
94 let offset = col as usize * words;
95 let data = skip.dense_data();
96 if offset + words > data.len() {
97 return None;
98 }
99 Some(Cow::Borrowed(&data[offset..offset + words]))
100 }
101
102 BigramQuery::And(children) => {
103 let mut result: Option<Vec<u64>> = None;
104 for child in children {
105 if let Some(child_bits) = child.evaluate_cow(index) {
106 result = Some(match result {
107 None => child_bits.into_owned(),
108 Some(mut r) => {
109 bitset_and(&mut r, &child_bits);
110 r
111 }
112 });
113 }
114 }
115 result.map(Cow::Owned)
116 }
117
118 BigramQuery::Or(children) => {
119 if children.is_empty() {
120 return None;
121 }
122 let mut result: Option<Vec<u64>> = None;
123 for child in children {
124 match child.evaluate_cow(index) {
125 None => return None,
127 Some(child_bits) => {
128 result = Some(match result {
129 None => child_bits.into_owned(),
130 Some(mut r) => {
131 bitset_or(&mut r, &child_bits);
132 r
133 }
134 });
135 }
136 }
137 }
138 result.map(Cow::Owned)
139 }
140 }
141 }
142}
143
144struct HirInfo {
146 query: BigramQuery,
147 first: Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>,
149 last: Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>,
151 can_be_empty: bool,
153}
154
155impl HirInfo {
156 fn empty() -> Self {
157 Self {
158 query: BigramQuery::Any,
159 first: None,
160 last: None,
161 can_be_empty: true,
162 }
163 }
164}
165
166pub(crate) fn fuzzy_to_bigram_query(query: &str, num_probes: usize) -> BigramQuery {
170 let lower: Vec<u8> = query.bytes().map(|b| b.to_ascii_lowercase()).collect();
171
172 if lower.len() < 2 {
173 return BigramQuery::Any;
174 }
175
176 let max_typos = (lower.len() / 3).min(2);
177
178 let bigram_keys: Vec<u16> = lower
180 .windows(2)
181 .filter_map(|w| consec_key(w[0], w[1]))
182 .collect();
183
184 if bigram_keys.is_empty() {
185 return BigramQuery::Any;
186 }
187
188 if max_typos == 0 {
190 return simplify_and(
191 bigram_keys
192 .iter()
193 .map(|&k| BigramQuery::Consec(k))
194 .collect(),
195 );
196 }
197
198 let n = num_probes.min(bigram_keys.len());
200 if n <= max_typos {
201 return simplify_or(
203 bigram_keys
204 .iter()
205 .map(|&k| BigramQuery::Consec(k))
206 .collect(),
207 );
208 }
209
210 let probes: Vec<u16> = if n == bigram_keys.len() {
211 bigram_keys
212 } else {
213 (0..n)
214 .map(|i| {
215 let idx = i * (bigram_keys.len() - 1) / (n - 1);
216 bigram_keys[idx]
217 })
218 .collect()
219 };
220
221 let required = n - max_typos;
222
223 if required >= n {
225 return simplify_and(probes.iter().map(|&k| BigramQuery::Consec(k)).collect());
226 }
227
228 let mut branches = Vec::new();
230 let mut combo = vec![0u16; required];
231 combine(&probes, required, 0, 0, &mut combo, &mut branches);
232
233 simplify_or(branches)
234}
235
236fn combine(
238 items: &[u16],
239 k: usize,
240 start: usize,
241 depth: usize,
242 combo: &mut [u16],
243 branches: &mut Vec<BigramQuery>,
244) {
245 if depth == k {
246 branches.push(simplify_and(
247 combo.iter().map(|&key| BigramQuery::Consec(key)).collect(),
248 ));
249 return;
250 }
251 let remaining = k - depth;
252 for i in start..=items.len() - remaining {
253 combo[depth] = items[i];
254 combine(items, k, i + 1, depth + 1, combo, branches);
255 }
256}
257
258pub(crate) fn regex_to_bigram_query(pattern: &str) -> BigramQuery {
259 let mut parser = regex_syntax::ParserBuilder::new()
260 .unicode(false)
261 .utf8(false)
262 .build();
263
264 let hir = match parser.parse(pattern) {
265 Ok(h) => h,
266 Err(_) => return BigramQuery::Any,
267 };
268
269 decompose(&hir).query
270}
271
272fn decompose(hir: &Hir) -> HirInfo {
273 let can_be_empty = hir.properties().minimum_len().is_none_or(|n| n == 0);
274
275 match hir.kind() {
276 HirKind::Empty => HirInfo::empty(),
277
278 HirKind::Literal(lit) => decompose_literal(lit.0.as_ref()),
279
280 HirKind::Class(class) => {
281 let bytes = expand_class(class);
282 match bytes {
283 Some(b) if !b.is_empty() => HirInfo {
284 query: BigramQuery::Any,
285 first: Some(b.clone()),
286 last: Some(b),
287 can_be_empty,
288 },
289 _ => HirInfo {
290 query: BigramQuery::Any,
291 first: None,
292 last: None,
293 can_be_empty,
294 },
295 }
296 }
297
298 HirKind::Look(_) => HirInfo::empty(),
299
300 HirKind::Repetition(rep) => {
301 let inner = decompose(&rep.sub);
302 if rep.min == 0 {
303 HirInfo {
304 query: BigramQuery::Any,
305 first: inner.first,
306 last: inner.last,
307 can_be_empty: true,
308 }
309 } else {
310 let mut qs = Vec::new();
312 if !inner.query.is_any() {
313 qs.push(inner.query.clone());
314 }
315 if rep.min >= 2 {
317 push_cross_consec(&mut qs, inner.last.as_deref(), inner.first.as_deref());
318 }
319 HirInfo {
320 query: simplify_and(qs),
321 first: inner.first,
322 last: inner.last,
323 can_be_empty,
324 }
325 }
326 }
327
328 HirKind::Capture(cap) => decompose(&cap.sub),
329
330 HirKind::Concat(parts) => decompose_concat(parts),
331
332 HirKind::Alternation(alts) => decompose_alternation(alts),
333 }
334}
335
336fn decompose_literal(bytes: &[u8]) -> HirInfo {
338 if bytes.is_empty() {
339 return HirInfo::empty();
340 }
341
342 let lower: SmallVec<[u8; 64]> = bytes.iter().map(|b| b.to_ascii_lowercase()).collect();
343
344 if lower.len() == 1 {
345 let b = lower[0];
346 let first = if (32..=126).contains(&b) {
347 Some(SmallVec::from_slice(&[b]))
348 } else {
349 None
350 };
351 return HirInfo {
352 query: BigramQuery::Any,
353 first: first.clone(),
354 last: first,
355 can_be_empty: false,
356 };
357 }
358
359 let mut qs: Vec<BigramQuery> = Vec::new();
360
361 for w in lower.windows(2) {
363 if let Some(k) = consec_key(w[0], w[1]) {
364 qs.push(BigramQuery::Consec(k));
365 }
366 }
367
368 if lower.len() >= 3 {
370 for i in 0..lower.len() - 2 {
371 if let Some(k) = consec_key(lower[i], lower[i + 2]) {
372 qs.push(BigramQuery::Skip1(k));
373 }
374 }
375 }
376
377 let first_byte = lower[0];
378 let last_byte = *lower.last().unwrap();
379
380 HirInfo {
381 query: simplify_and(qs),
382 first: if (32..=126).contains(&first_byte) {
383 Some(SmallVec::from_slice(&[first_byte]))
384 } else {
385 None
386 },
387 last: if (32..=126).contains(&last_byte) {
388 Some(SmallVec::from_slice(&[last_byte]))
389 } else {
390 None
391 },
392 can_be_empty: false,
393 }
394}
395
396fn decompose_concat(parts: &[Hir]) -> HirInfo {
397 if parts.is_empty() {
398 return HirInfo::empty();
399 }
400
401 let infos: Vec<HirInfo> = parts.iter().map(decompose).collect();
402 let mut qs: Vec<BigramQuery> = Vec::new();
403
404 for info in &infos {
406 if !info.query.is_any() {
407 qs.push(info.query.clone());
408 }
409 }
410
411 for pair in infos.windows(2) {
413 if !pair[0].can_be_empty && !pair[1].can_be_empty {
414 push_cross_consec(&mut qs, pair[0].last.as_deref(), pair[1].first.as_deref());
415 }
416 }
417
418 if parts.len() >= 3 {
421 for i in 0..parts.len() - 2 {
422 let left = &infos[i];
423 let mid = &parts[i + 1];
424 let right = &infos[i + 2];
425
426 let min_len = mid.properties().minimum_len();
427 let max_len = mid.properties().maximum_len();
428 let is_1byte = min_len == Some(1) && max_len == Some(1);
429
430 if is_1byte && !left.can_be_empty && !right.can_be_empty {
431 push_cross_skip1(&mut qs, left.last.as_deref(), right.first.as_deref());
432 }
433 }
434 }
435
436 let first = collect_first(&infos);
437 let last = collect_last(&infos);
438 let can_be_empty = infos.iter().all(|i| i.can_be_empty);
439
440 HirInfo {
441 query: simplify_and(qs),
442 first,
443 last,
444 can_be_empty,
445 }
446}
447
448fn decompose_alternation(alts: &[Hir]) -> HirInfo {
449 if alts.is_empty() {
450 return HirInfo::empty();
451 }
452
453 let infos: Vec<HirInfo> = alts.iter().map(decompose).collect();
454 let query = simplify_or(infos.iter().map(|i| i.query.clone()).collect());
455 let first = merge_byte_sets(infos.iter().map(|i| &i.first));
456 let last = merge_byte_sets(infos.iter().map(|i| &i.last));
457 let can_be_empty = infos.iter().any(|i| i.can_be_empty);
458
459 HirInfo {
460 query,
461 first,
462 last,
463 can_be_empty,
464 }
465}
466
467fn expand_class(class: &Class) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
468 let mut bytes: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
469 match class {
470 Class::Bytes(bc) => {
471 for range in bc.ranges() {
472 let count = (range.end() as usize) - (range.start() as usize) + 1;
473 if bytes.len() + count > MAX_CLASS_EXPAND {
474 return None;
475 }
476 for b in range.start()..=range.end() {
477 if (32..=126).contains(&b) {
478 let lower = b.to_ascii_lowercase();
479 if !bytes.contains(&lower) {
480 bytes.push(lower);
481 }
482 }
483 }
484 }
485 }
486 Class::Unicode(uc) => {
487 for range in uc.ranges() {
488 let start = range.start() as u32;
489 let end = range.end() as u32;
490 if start > 127 {
491 continue;
492 }
493 let ascii_end = end.min(126) as u8;
494 let ascii_start = start.max(32) as u8;
495 if ascii_start > ascii_end {
496 continue;
497 }
498 let count = (ascii_end - ascii_start) as usize + 1;
499 if bytes.len() + count > MAX_CLASS_EXPAND {
500 return None;
501 }
502 for b in ascii_start..=ascii_end {
503 let lower = b.to_ascii_lowercase();
504 if !bytes.contains(&lower) {
505 bytes.push(lower);
506 }
507 }
508 }
509 }
510 }
511 if bytes.is_empty() { None } else { Some(bytes) }
512}
513
514fn push_cross_consec(qs: &mut Vec<BigramQuery>, last: Option<&[u8]>, first: Option<&[u8]>) {
516 if let Some(q) = cross_product(last, first, false) {
517 qs.push(q);
518 }
519}
520
521fn push_cross_skip1(qs: &mut Vec<BigramQuery>, last: Option<&[u8]>, first: Option<&[u8]>) {
523 if let Some(q) = cross_product(last, first, true) {
524 qs.push(q);
525 }
526}
527
528fn cross_product(last: Option<&[u8]>, first: Option<&[u8]>, skip: bool) -> Option<BigramQuery> {
529 let last = last?;
530 let first = first?;
531 let n = last.len() * first.len();
532 if n == 0 || n > MAX_CLASS_EXPAND * MAX_CLASS_EXPAND {
533 return None;
534 }
535
536 let mut bigrams: Vec<BigramQuery> = Vec::with_capacity(n);
537 for &l in last {
538 for &f in first {
539 if let Some(k) = consec_key(l, f) {
540 let node = if skip {
541 BigramQuery::Skip1(k)
542 } else {
543 BigramQuery::Consec(k)
544 };
545 bigrams.push(node);
546 }
547 }
548 }
549
550 match bigrams.len() {
551 0 => None,
552 1 => Some(bigrams.into_iter().next().unwrap()),
553 _ => Some(simplify_or(bigrams)),
554 }
555}
556
557fn collect_first(infos: &[HirInfo]) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
558 let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
559 for info in infos {
560 if let Some(ref bytes) = info.first {
561 for &b in bytes {
562 if !result.contains(&b) {
563 if result.len() >= MAX_CLASS_EXPAND {
564 return None;
565 }
566 result.push(b);
567 }
568 }
569 } else if !info.can_be_empty {
570 return None;
571 }
572 if !info.can_be_empty {
573 break;
574 }
575 }
576 if result.is_empty() {
577 None
578 } else {
579 Some(result)
580 }
581}
582
583fn collect_last(infos: &[HirInfo]) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
584 let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
585 for info in infos.iter().rev() {
586 if let Some(ref bytes) = info.last {
587 for &b in bytes {
588 if !result.contains(&b) {
589 if result.len() >= MAX_CLASS_EXPAND {
590 return None;
591 }
592 result.push(b);
593 }
594 }
595 } else if !info.can_be_empty {
596 return None;
597 }
598 if !info.can_be_empty {
599 break;
600 }
601 }
602 if result.is_empty() {
603 None
604 } else {
605 Some(result)
606 }
607}
608
609fn merge_byte_sets<'a>(
610 iter: impl Iterator<Item = &'a Option<SmallVec<[u8; MAX_CLASS_EXPAND]>>>,
611) -> Option<SmallVec<[u8; MAX_CLASS_EXPAND]>> {
612 let mut result: SmallVec<[u8; MAX_CLASS_EXPAND]> = SmallVec::new();
613 for opt in iter {
614 match opt {
615 None => return None,
616 Some(bytes) => {
617 for &b in bytes {
618 if !result.contains(&b) {
619 if result.len() >= MAX_CLASS_EXPAND {
620 return None;
621 }
622 result.push(b);
623 }
624 }
625 }
626 }
627 }
628 if result.is_empty() {
629 None
630 } else {
631 Some(result)
632 }
633}
634
635fn simplify_and(children: Vec<BigramQuery>) -> BigramQuery {
636 let mut flat: Vec<BigramQuery> = Vec::new();
637 for child in children {
638 match child {
639 BigramQuery::Any => {}
640 BigramQuery::And(inner) => flat.extend(inner),
641 other => flat.push(other),
642 }
643 }
644 match flat.len() {
645 0 => BigramQuery::Any,
646 1 => flat.into_iter().next().unwrap(),
647 _ => BigramQuery::And(flat),
648 }
649}
650
651fn simplify_or(children: Vec<BigramQuery>) -> BigramQuery {
652 if children.iter().any(|c| c.is_any()) {
653 return BigramQuery::Any;
654 }
655 let mut flat: Vec<BigramQuery> = Vec::new();
656 for child in children {
657 match child {
658 BigramQuery::Or(inner) => flat.extend(inner),
659 other => flat.push(other),
660 }
661 }
662 match flat.len() {
663 0 => BigramQuery::Any,
664 1 => flat.into_iter().next().unwrap(),
665 _ => BigramQuery::Or(flat),
666 }
667}
668
669#[cfg(test)]
670mod tests {
671 use super::*;
672 use crate::bigram_filter::BigramIndexBuilder;
673
674 fn build_test_index(files: &[&[u8]]) -> BigramFilter {
676 let n = files.len();
677 let consec_builder = BigramIndexBuilder::new(n);
678 let skip_builder = BigramIndexBuilder::new(n);
679 for (i, content) in files.iter().enumerate() {
680 consec_builder.add_file_content(&skip_builder, i, content);
681 }
682 let mut idx = consec_builder.compress(Some(0));
683 idx.set_skip_index(skip_builder.compress(Some(0)));
684 idx
685 }
686
687 #[test]
688 fn literal_pattern() {
689 let idx = build_test_index(&[
690 b"hello world", b"goodbye world", b"say hello there", ]);
694
695 let q = regex_to_bigram_query("hello");
696 assert!(!q.is_any());
697
698 let candidates = q.evaluate(&idx).unwrap();
699 assert!(BigramFilter::is_candidate(&candidates, 0));
700 assert!(!BigramFilter::is_candidate(&candidates, 1));
701 assert!(BigramFilter::is_candidate(&candidates, 2));
702 }
703
704 #[test]
705 fn alternation() {
706 let idx = build_test_index(&[
707 b"has foo in it", b"has bar in it", b"has xyz in it", ]);
711
712 let q = regex_to_bigram_query("foo|bar");
713 assert!(!q.is_any());
714
715 let candidates = q.evaluate(&idx).unwrap();
716 assert!(BigramFilter::is_candidate(&candidates, 0));
717 assert!(BigramFilter::is_candidate(&candidates, 1));
718 assert!(!BigramFilter::is_candidate(&candidates, 2));
720 }
721
722 #[test]
723 fn wildcard_concat() {
724 let idx = build_test_index(&[
725 b"foo something bar", b"foo only", b"only bar", ]);
729
730 let q = regex_to_bigram_query("foo.*bar");
731 assert!(!q.is_any());
732
733 let candidates = q.evaluate(&idx).unwrap();
734 assert!(BigramFilter::is_candidate(&candidates, 0));
735 assert!(!BigramFilter::is_candidate(&candidates, 1));
737 assert!(!BigramFilter::is_candidate(&candidates, 2));
738 }
739
740 #[test]
741 fn sparse1_across_dot() {
742 let idx = build_test_index(&[
744 b"axb", b"ayb", b"xyz", ]);
748
749 let q = regex_to_bigram_query("a.b");
750 assert!(!q.is_any());
751
752 let candidates = q.evaluate(&idx).unwrap();
753 assert!(BigramFilter::is_candidate(&candidates, 0));
754 assert!(BigramFilter::is_candidate(&candidates, 1));
755 assert!(!BigramFilter::is_candidate(&candidates, 2));
756 }
757
758 #[test]
759 fn sparse1_across_digit() {
760 let idx = build_test_index(&[
762 b"foo3bar baz", b"foobar baz", b"xyz only", ]);
766
767 let q = regex_to_bigram_query(r"foo\dbar");
768 assert!(!q.is_any());
769
770 let candidates = q.evaluate(&idx).unwrap();
771 assert!(BigramFilter::is_candidate(&candidates, 0));
772 assert!(!BigramFilter::is_candidate(&candidates, 2));
776 }
777
778 #[test]
779 fn pure_wildcard_is_any() {
780 let q = regex_to_bigram_query(".*");
781 assert!(q.is_any());
782 }
783
784 #[test]
785 fn single_char_is_any() {
786 let q = regex_to_bigram_query("a");
787 assert!(q.is_any());
788 }
789
790 #[test]
791 fn invalid_regex_is_any() {
792 let q = regex_to_bigram_query("[invalid");
793 assert!(q.is_any());
794 }
795
796 #[test]
797 fn optional_group_excluded() {
798 let q = regex_to_bigram_query("foo(bar)?baz");
800 assert!(!q.is_any());
801
802 let idx = build_test_index(&[
803 b"foobaz content", b"foobarbaz content", b"xyz only", ]);
807
808 let candidates = q.evaluate(&idx).unwrap();
809 assert!(BigramFilter::is_candidate(&candidates, 0));
810 assert!(BigramFilter::is_candidate(&candidates, 1));
811 assert!(!BigramFilter::is_candidate(&candidates, 2));
812 }
813
814 #[test]
815 fn repetition_min2_cross_boundary() {
816 let q = regex_to_bigram_query("(ab){2,}");
818 assert!(!q.is_any());
819
820 let idx = build_test_index(&[
821 b"ababab", b"abonly", b"xyz", ]);
825
826 let candidates = q.evaluate(&idx).unwrap();
827 assert!(BigramFilter::is_candidate(&candidates, 0));
828 assert!(!BigramFilter::is_candidate(&candidates, 2));
829 }
830
831 #[test]
832 fn two_dots_no_sparse1() {
833 let q = regex_to_bigram_query("a..b");
836 assert!(q.is_any());
838 }
839
840 #[test]
841 fn character_class_cross_boundary() {
842 let idx = build_test_index(&[
847 b"ade content", b"bde content", b"cde content", b"xde content", ]);
852
853 let q = regex_to_bigram_query("[abc]de");
854 assert!(!q.is_any());
855
856 let candidates = q.evaluate(&idx).unwrap();
857 assert!(BigramFilter::is_candidate(&candidates, 0));
858 assert!(BigramFilter::is_candidate(&candidates, 1));
859 assert!(BigramFilter::is_candidate(&candidates, 2));
860 assert!(!BigramFilter::is_candidate(&candidates, 3));
862 }
863
864 fn has_consec(q: &BigramQuery, a: u8, b: u8) -> bool {
867 let Some(key) = consec_key(a, b) else {
868 return false;
869 };
870 match q {
871 BigramQuery::Consec(k) => *k == key,
872 BigramQuery::And(cs) | BigramQuery::Or(cs) => cs.iter().any(|c| has_consec(c, a, b)),
873 _ => false,
874 }
875 }
876
877 fn has_skip1(q: &BigramQuery, a: u8, b: u8) -> bool {
878 let Some(key) = consec_key(a, b) else {
879 return false;
880 };
881 match q {
882 BigramQuery::Skip1(k) => *k == key,
883 BigramQuery::And(cs) | BigramQuery::Or(cs) => cs.iter().any(|c| has_skip1(c, a, b)),
884 _ => false,
885 }
886 }
887
888 type Bg = (&'static str, bool);
891 const C: bool = false;
892 const S: bool = true;
893
894 #[test]
902 fn common_regex_patterns() {
903 #[rustfmt::skip]
904 let cases: &[(&str, Option<&[Bg]>)] = &[
905 (r"^\d+$", None), (r"^\d*\.\d+$", None), (r"^\d*(\.\d+)?$", None), (r"^-?\d*(\.\d+)?$", None), (r"[-]?[0-9]+[,.]?[0-9]*([/][0-9]+[,.]?[0-9]*)*", None), (r"^[a-zA-Z0-9]*$", None), (r"^[a-zA-Z0-9 ]*$", None), (r"^([a-zA-Z0-9._%-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})*$", None), (r"^([a-z0-9_\.\+-]+)@([\da-z\.-]+)\.([a-z\.]{2,6})$", None), (r"(?=(.*[0-9]))(?=.*[!@#$%^&*()\[\]{}\-_+=~`|:;<>,./?\x5c])(?=.*[a-z])(?=(.*[A-Z]))(?=(.*)).{8,}", None), (r"(?=(.*[0-9]))((?=.*[A-Za-z0-9])(?=.*[A-Z])(?=.*[a-z]))^.{8,}$", None), (r"^[a-z0-9_-]{3,16}$", None), (r"(https?://)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", None), (r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$", None), (r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))", None), (r"[12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])", None), (r"^(0?[1-9]|1[0-2]):[0-5][0-9]$", None), (r"((1[0-2]|0?[1-9]):([0-5][0-9]) ?([AaPp][Mm]))", None), (r"^(0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$", None), (r"^([0-9]|0[0-9]|1[0-9]|2[0-3]):[0-5][0-9]$", None), (r"(?:[01]\d|2[0123]):(?:[012345]\d):(?:[012345]\d)", None), (r"</?[\w\s]*>|<.+[\W]>", None), (r"\bon\w+=\S+(?=.*>)", None), (r"^[a-z0-9]+(?:-[a-z0-9]+)*$", None), (r"(\b\w+\b)(?=.*\b\1\b)", None), (r"^[\w,\s-]+\.[A-Za-z]{3}$", None), (r"^[A-PR-WY][1-9]\d\s?\d{4}[1-9]$", None), (r"https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", Some(&[
938 ("ht", C), ("tt", C), ("tp", C), ("ht", S), ("tp", S), (":/", C), ("//", C), ])),
942
943 (r"fn\s+\w+", Some(&[
945 ("fn", C), ("n ", C), ])),
948
949 (r"use\s+crate::", Some(&[
951 ("us", C), ("se", C), ("ue", S), ("cr", C), ("ra", C), ("at", C), ("te", C), ("::", C),
954 ("ca", S), ("rt", S), ("ae", S), ])),
956
957 (r"unwrap\(\)|expect\(", Some(&[
959 ("nw", C), ("wr", C), ("ra", C), ("ap", C), ("p(", C),
961 ("xp", C), ("pe", C), ("ec", C), ("ct", C), ("t(", C),
963 ])),
964
965 (r"TODO|FIXME|HACK", Some(&[
967 ("to", C), ("od", C), ("do", C), ("fi", C), ("ix", C), ("xm", C), ("me", C),
970 ("ha", C), ("ac", C), ("ck", C), ("hc", S), ("ak", S), ])),
973 ];
974
975 for (i, &(pattern, expected)) in cases.iter().enumerate() {
976 let q = regex_to_bigram_query(pattern);
977
978 if let Some(bigrams) = expected {
979 assert!(
980 !q.is_any(),
981 "#{i} {pattern:?}: expected bigrams but got Any"
982 );
983
984 for &(pair, skip) in bigrams {
985 let b = pair.as_bytes();
986 debug_assert_eq!(b.len(), 2, "bigram must be 2 chars: {pair:?}");
987 let found = if skip {
988 has_skip1(&q, b[0], b[1])
989 } else {
990 has_consec(&q, b[0], b[1])
991 };
992 let kind = if skip { "skip-1" } else { "consec" };
993 assert!(found, "#{i} {pattern:?}: missing {kind} bigram {pair:?}");
994 }
995 }
996 }
997 }
998}