1use crate::license_detection::index::LicenseIndex;
7use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
8use crate::license_detection::models::PositionSpan;
9use crate::license_detection::position_set::PositionSet;
10use crate::license_detection::spdx_lid::split_spdx_lid;
11use crate::license_detection::tokenize::STOPWORDS;
12use crate::license_detection::tokenize::tokenize_as_ids;
13use regex::Regex;
14use std::cell::{OnceCell, RefCell};
15use std::collections::HashMap;
16use std::sync::LazyLock;
17use std::time::Instant;
18
19static QUERY_PATTERN: LazyLock<Regex> =
20 LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
21static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
22 Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
23 .expect("valid matched text regex")
24});
25
/// One lexical piece of a text being rendered for match diagnostics.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the piece as it is written to the rendered output.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Position among the dictionary-known tokens, when the piece is one.
    pos: Option<usize>,
    /// `true` for word pieces, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// Set once the piece's position is found among the matched positions.
    is_matched: bool,
}
34
35#[derive(Debug)]
47pub struct Query<'a> {
48 pub text: String,
52
53 pub tokens: Vec<TokenId>,
57
58 pub line_by_pos: Vec<usize>,
65
66 pub unknowns_by_pos: HashMap<Option<usize>, usize>,
74
75 pub stopwords_by_pos: HashMap<Option<usize>, usize>,
81
82 pub shorts_and_digits_pos: PositionSet,
88
89 pub high_matchables: PositionSet,
95
96 pub low_matchables: PositionSet,
102
103 pub is_binary: bool,
107
108 pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,
114
115 pub spdx_lines: Vec<(String, usize, usize)>,
122
123 pub index: &'a LicenseIndex,
125}
126
/// Returns lines `start_line..=end_line` of `text` (1-based, inclusive),
/// joined with `\n`.
///
/// An empty string is returned when either bound is `0` or when
/// `start_line > end_line`. Lines past the end of `text` are simply absent
/// from the result.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // `start_line >= 1` and `start_line <= end_line`, so neither expression
    // below can underflow or overflow.
    let wanted = end_line - start_line + 1;
    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();
    selected.join("\n")
}
145
146pub fn matched_text_diagnostics_from_text(
147 text: &str,
148 query: &Query<'_>,
149 matched_positions: &PositionSet,
150 start_pos: usize,
151 end_pos: usize,
152 start_line: usize,
153 end_line: usize,
154) -> String {
155 let tokens = tokenize_matched_text(text, query);
156 let reportable_tokens = collect_reportable_tokens(
157 tokens,
158 matched_positions,
159 start_pos,
160 end_pos,
161 start_line,
162 end_line,
163 );
164 let line_endings = collect_line_endings(text);
165
166 render_diagnostic_tokens(&reportable_tokens, &line_endings)
167}
168
169fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
170 let mut tokens = Vec::new();
171 let mut pos = 0usize;
172 for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
173 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
174 if let Some(token_match) = capture.name("token") {
175 let token_text = token_match.as_str();
176 let retokenized: Vec<String> = QUERY_PATTERN
177 .find_iter(&token_text.to_lowercase())
178 .map(|m| m.as_str().to_string())
179 .filter(|token| !STOPWORDS.contains(token.as_str()))
180 .collect();
181
182 if retokenized.is_empty() {
183 tokens.push(MatchedTextToken {
184 value: token_text.to_string(),
185 line_num,
186 pos: None,
187 is_text: true,
188 is_matched: false,
189 });
190 } else if retokenized.len() == 1 {
191 let token = &retokenized[0];
192 let token_pos = if query.index.dictionary.get(token).is_some() {
193 let current_pos = pos;
194 pos += 1;
195 Some(current_pos)
196 } else {
197 None
198 };
199
200 tokens.push(MatchedTextToken {
201 value: token_text.to_string(),
202 line_num,
203 pos: token_pos,
204 is_text: true,
205 is_matched: false,
206 });
207 } else {
208 for token in retokenized {
209 let token_pos = if query.index.dictionary.get(&token).is_some() {
210 let current_pos = pos;
211 pos += 1;
212 Some(current_pos)
213 } else {
214 None
215 };
216
217 tokens.push(MatchedTextToken {
218 value: token,
219 line_num,
220 pos: token_pos,
221 is_text: true,
222 is_matched: false,
223 });
224 }
225 }
226 } else if let Some(punct_match) = capture.name("punct") {
227 tokens.push(MatchedTextToken {
228 value: punct_match.as_str().to_string(),
229 line_num,
230 pos: None,
231 is_text: false,
232 is_matched: false,
233 });
234 }
235 }
236 }
237
238 tokens
239}
240
241fn collect_reportable_tokens(
242 tokens: Vec<MatchedTextToken>,
243 matched_positions: &PositionSet,
244 start_pos: usize,
245 end_pos: usize,
246 start_line: usize,
247 end_line: usize,
248) -> Vec<MatchedTextToken> {
249 let mut reportable = Vec::new();
250 let mut started = false;
251 let mut finished = false;
252 let mut end_real_pos = None;
253 let mut last_real_pos = None;
254
255 for (real_pos, mut token) in tokens.into_iter().enumerate() {
256 if token.line_num < start_line {
257 continue;
258 }
259
260 if token.line_num > end_line {
261 break;
262 }
263
264 let mut is_included = false;
265
266 if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
267 token.is_matched = true;
268 is_included = true;
269 }
270
271 if !started && token.pos == Some(start_pos) {
272 started = true;
273 is_included = true;
274 }
275
276 if started && !finished {
277 is_included = true;
278 }
279
280 if token.pos == Some(end_pos) {
281 finished = true;
282 started = false;
283 end_real_pos = Some(real_pos);
284 }
285
286 if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
287 end_real_pos = None;
288 if !token.is_text && !token.value.trim().is_empty() {
289 is_included = true;
290 }
291 }
292
293 last_real_pos = Some(real_pos);
294
295 if is_included {
296 reportable.push(token);
297 }
298 }
299
300 reportable
301}
302
/// Returns the terminator of every line of `text`, in order: `"\r\n"`,
/// `"\n"`, or an empty string for a final line without a terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(String::from(ending));
    }

    endings
}
316
317fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
318 let mut rendered = String::new();
319 let mut previous_line: Option<usize> = None;
320
321 for token in tokens {
322 if let Some(prev_line) = previous_line
323 && token.line_num > prev_line
324 {
325 for line in prev_line..token.line_num {
326 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
327 rendered.push_str(line_ending.as_str());
328 }
329 }
330 }
331
332 let token_value = if token.is_text {
333 token.value.as_str()
334 } else {
335 token
336 .value
337 .strip_suffix("\r\n")
338 .or_else(|| token.value.strip_suffix('\n'))
339 .unwrap_or(token.value.as_str())
340 };
341
342 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
343 if token.is_matched {
344 rendered.push_str(token_value);
345 } else {
346 rendered.push('[');
347 rendered.push_str(token_value);
348 rendered.push(']');
349 }
350 } else {
351 rendered.push_str(token_value);
352 }
353
354 previous_line = Some(token.line_num);
355 }
356
357 rendered
358}
359
360impl<'a> Query<'a> {
361 const TEXT_LINE_THRESHOLD: usize = 15;
376 const BINARY_LINE_THRESHOLD: usize = 50;
377 const MAX_TOKEN_PER_LINE: usize = 25;
378
379 fn compute_spdx_offset(
380 tokens: &[QueryToken],
381 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
382 ) -> Option<usize> {
383 let get_known_id = |i: usize| -> Option<TokenId> {
384 match tokens.get(i)? {
385 QueryToken::Known(known) => Some(known.id),
386 _ => None,
387 }
388 };
389
390 let spdx_id = dictionary.get("spdx")?;
391 let license_id = dictionary.get("license")?;
392 let identifier_id = dictionary.get("identifier")?;
393 let licence_id = dictionary.get("licence");
394
395 let licenses_id = dictionary.get("licenses");
396 let nuget_id = dictionary.get("nuget");
397 let org_id = dictionary.get("org");
398
399 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
400 ids.iter().all(|id| id.is_some())
401 && ids[0] == Some(spdx_id)
402 && (ids[1] == Some(license_id) || ids[1] == licence_id)
403 && ids[2] == Some(identifier_id)
404 };
405
406 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
407 licenses_id.is_some()
408 && nuget_id.is_some()
409 && org_id.is_some()
410 && ids[0] == licenses_id
411 && ids[1] == Some(nuget_id.unwrap())
412 && ids[2] == Some(org_id.unwrap())
413 };
414
415 if tokens.len() >= 3 {
416 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
417 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
418 return Some(0);
419 }
420 }
421
422 if tokens.len() >= 4 {
423 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
424 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
425 return Some(1);
426 }
427 }
428
429 if tokens.len() >= 5 {
430 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
431 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
432 return Some(2);
433 }
434 }
435
436 None
437 }
438
439 pub fn from_extracted_text(
440 text: &str,
441 index: &'a LicenseIndex,
442 binary_derived: bool,
443 ) -> Result<Self, anyhow::Error> {
444 Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
445 }
446
447 pub fn from_extracted_text_with_deadline(
448 text: &str,
449 index: &'a LicenseIndex,
450 binary_derived: bool,
451 deadline: Option<Instant>,
452 ) -> Result<Self, anyhow::Error> {
453 let line_threshold = if binary_derived {
454 Self::BINARY_LINE_THRESHOLD
455 } else {
456 Self::TEXT_LINE_THRESHOLD
457 };
458
459 Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
460 }
461
462 pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
466 self.query_run_ranges
467 .iter()
468 .map(|&(start, end)| QueryRun::new(self, start, end))
469 .collect()
470 }
471
472 fn with_source_options(
473 text: &str,
474 index: &'a LicenseIndex,
475 line_threshold: usize,
476 binary_derived: Option<bool>,
477 deadline: Option<Instant>,
478 ) -> Result<Self, anyhow::Error> {
479 crate::license_detection::ensure_within_deadline(deadline)?;
480 let is_binary = match binary_derived {
481 Some(is_binary) => is_binary,
482 None => Self::detect_binary(text)?,
483 };
484 let has_long_lines = Self::detect_long_lines(text);
485
486 let mut tokens = Vec::new();
487 let mut line_by_pos = Vec::new();
488 let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
489 let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
490 let mut shorts_and_digits_pos = PositionSet::new();
491 let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();
492
493 let mut known_pos: Option<usize> = None;
494 let mut started = false;
495 let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();
496
497 for (current_line, (line_index, line)) in (1usize..).zip(text.lines().enumerate()) {
498 if line_index.is_multiple_of(128) {
499 crate::license_detection::ensure_within_deadline(deadline)?;
500 }
501
502 let line_trimmed = line.trim();
503 let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();
504
505 let mut line_first_known_pos = None;
506
507 let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);
508
509 for query_token in &line_query_tokens {
510 match query_token {
511 QueryToken::Known(known_token) => {
512 known_pos = Some(known_pos.map_or(0, |p| p + 1));
513 started = true;
514 tokens.push(known_token.id);
515 line_by_pos.push(current_line);
516 line_tokens.push(Some(*known_token));
517
518 if line_first_known_pos.is_none() {
519 line_first_known_pos = known_pos;
520 }
521
522 if known_token.is_short_or_digit {
523 let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
524 }
525 }
526 QueryToken::Unknown if !started => {
527 *unknowns_by_pos.entry(None).or_insert(0) += 1;
528 line_tokens.push(None);
529 }
530 QueryToken::Unknown => {
531 *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
532 line_tokens.push(None);
533 }
534 QueryToken::Stopword if !started => {
535 *stopwords_by_pos.entry(None).or_insert(0) += 1;
536 }
537 QueryToken::Stopword => {
538 *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
539 }
540 }
541 }
542
543 let line_last_known_pos = known_pos;
544
545 let spdx_start_offset =
546 Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);
547
548 if let Some(offset) = spdx_start_offset
549 && let Some(line_first_known_pos) = line_first_known_pos
550 {
551 let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
552 let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
553 let spdx_start_known_pos = line_first_known_pos + offset;
554
555 if spdx_start_known_pos <= line_last_known_pos.unwrap() {
556 let spdx_end = line_last_known_pos.unwrap() + 1;
557 spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
558 }
559 }
560 tokens_by_line.push(line_tokens);
561 }
562
563 crate::license_detection::ensure_within_deadline(deadline)?;
564
565 let high_matchables: PositionSet = tokens
566 .iter()
567 .enumerate()
568 .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
569 .map(|(pos, _tid)| pos)
570 .collect();
571
572 let low_matchables: PositionSet = tokens
573 .iter()
574 .enumerate()
575 .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
576 .map(|(pos, _tid)| pos)
577 .collect();
578
579 let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);
580
581 Ok(Query {
582 text: text.to_string(),
583 tokens,
584 line_by_pos,
585 unknowns_by_pos,
586 stopwords_by_pos,
587 shorts_and_digits_pos,
588 high_matchables,
589 low_matchables,
590 is_binary,
591 query_run_ranges: query_runs,
592 spdx_lines,
593 index,
594 })
595 }
596
597 fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
611 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
612
613 if null_byte_count > 0 {
614 return Ok(true);
615 }
616
617 let non_printable_ratio = text
618 .chars()
619 .filter(|&c| {
620 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
621 })
622 .count() as f64
623 / text.len().max(1) as f64;
624
625 Ok(non_printable_ratio > 0.3)
626 }
627
628 fn detect_long_lines(text: &str) -> bool {
638 text.lines()
639 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
640 }
641
642 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
643 lines
644 .iter()
645 .flat_map(|line| {
646 if line.is_empty() {
647 return Vec::new();
648 }
649
650 if line.len() <= Self::MAX_TOKEN_PER_LINE {
651 vec![line.clone()]
652 } else {
653 line.chunks(Self::MAX_TOKEN_PER_LINE)
654 .map(|chunk| chunk.to_vec())
655 .collect()
656 }
657 })
658 .collect()
659 }
660
661 fn compute_query_runs(
662 tokens_by_line: &[Vec<Option<KnownToken>>],
663 line_threshold: usize,
664 has_long_lines: bool,
665 ) -> Vec<(usize, Option<usize>)> {
666 let processed_lines = if has_long_lines {
667 Self::break_long_lines(tokens_by_line)
668 } else {
669 tokens_by_line.to_vec()
670 };
671
672 let mut query_runs = Vec::new();
673 let mut query_run_start = 0usize;
674 let mut query_run_end = None;
675 let mut empty_lines = 0usize;
676 let mut pos = 0usize;
677 let mut query_run_is_all_digit = true;
678
679 for line_tokens in processed_lines {
680 if query_run_end.is_some() && empty_lines >= line_threshold {
681 if !query_run_is_all_digit {
682 query_runs.push((query_run_start, query_run_end));
683 }
684 query_run_start = pos;
685 query_run_end = None;
686 empty_lines = 0;
687 query_run_is_all_digit = true;
688 }
689
690 if query_run_end.is_none() {
691 query_run_start = pos;
692 }
693
694 if line_tokens.is_empty() {
695 empty_lines += 1;
696 continue;
697 }
698
699 let line_is_all_digit = line_tokens
700 .iter()
701 .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
702 let mut line_has_known_tokens = false;
703 let mut line_has_good_tokens = false;
704
705 for known in line_tokens.into_iter().flatten() {
706 line_has_known_tokens = true;
707 if known.kind == TokenKind::Legalese {
708 line_has_good_tokens = true;
709 }
710 if !known.is_digit_only {
711 query_run_is_all_digit = false;
712 }
713 query_run_end = Some(pos);
714 pos += 1;
715 }
716
717 if line_is_all_digit || !line_has_known_tokens {
718 empty_lines += 1;
719 continue;
720 }
721
722 if line_has_good_tokens {
723 empty_lines = 0;
724 } else {
725 empty_lines += 1;
726 }
727 }
728
729 if let Some(end) = query_run_end
730 && !query_run_is_all_digit
731 {
732 query_runs.push((query_run_start, Some(end)));
733 }
734
735 query_runs
736 }
737
738 #[inline]
748 pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
749 self.line_by_pos.get(pos).copied()
750 }
751
752 #[inline]
754 pub fn is_empty(&self) -> bool {
755 self.tokens.is_empty()
756 }
757
758 pub fn whole_query_run(&self) -> QueryRun<'a> {
762 QueryRun::whole_query_snapshot(self)
763 }
764
765 pub fn subtract(&mut self, span: &PositionSpan) {
774 self.high_matchables.remove_span(span);
775 self.low_matchables.remove_span(span);
776 }
777
778 pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
792 matched_text_from_text(&self.text, start_line, end_line)
793 }
794}
795
796#[derive(Debug, Clone)]
797struct WholeQueryRunSnapshot<'a> {
798 index: &'a LicenseIndex,
799 tokens: Vec<TokenId>,
800 line_by_pos: Vec<usize>,
801 high_matchables: PositionSet,
802 low_matchables: PositionSet,
803}
804
805#[derive(Debug, Clone)]
813pub struct QueryRun<'a> {
814 query: Option<&'a Query<'a>>,
815 whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
816 pub start: usize,
817 pub end: Option<usize>,
818 cached_high_matchables: OnceCell<PositionSet>,
819 cached_low_matchables: OnceCell<PositionSet>,
820 combined_matchables: RefCell<Option<PositionSet>>,
821}
822
823impl<'a> QueryRun<'a> {
824 pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
833 Self {
834 query: Some(query),
835 whole_query_snapshot: None,
836 start,
837 end,
838 cached_high_matchables: OnceCell::new(),
839 cached_low_matchables: OnceCell::new(),
840 combined_matchables: RefCell::new(None),
841 }
842 }
843
844 fn whole_query_snapshot(query: &Query<'a>) -> Self {
845 let end = if query.is_empty() {
846 None
847 } else {
848 Some(query.tokens.len() - 1)
849 };
850
851 Self {
852 query: None,
853 whole_query_snapshot: Some(WholeQueryRunSnapshot {
854 index: query.index,
855 tokens: query.tokens.clone(),
856 line_by_pos: query.line_by_pos.clone(),
857 high_matchables: query.high_matchables.clone(),
858 low_matchables: query.low_matchables.clone(),
859 }),
860 start: 0,
861 end,
862 cached_high_matchables: OnceCell::new(),
863 cached_low_matchables: OnceCell::new(),
864 combined_matchables: RefCell::new(None),
865 }
866 }
867
868 fn source_tokens(&self) -> &[TokenId] {
869 if let Some(query) = self.query {
870 &query.tokens
871 } else {
872 &self
873 .whole_query_snapshot
874 .as_ref()
875 .expect("snapshot-backed whole query run should have snapshot data")
876 .tokens
877 }
878 }
879
880 fn source_line_by_pos(&self) -> &[usize] {
881 if let Some(query) = self.query {
882 &query.line_by_pos
883 } else {
884 &self
885 .whole_query_snapshot
886 .as_ref()
887 .expect("snapshot-backed whole query run should have snapshot data")
888 .line_by_pos
889 }
890 }
891
892 fn source_high_matchables(&self) -> &PositionSet {
893 if let Some(query) = self.query {
894 &query.high_matchables
895 } else {
896 &self
897 .whole_query_snapshot
898 .as_ref()
899 .expect("snapshot-backed whole query run should have snapshot data")
900 .high_matchables
901 }
902 }
903
904 fn source_low_matchables(&self) -> &PositionSet {
905 if let Some(query) = self.query {
906 &query.low_matchables
907 } else {
908 &self
909 .whole_query_snapshot
910 .as_ref()
911 .expect("snapshot-backed whole query run should have snapshot data")
912 .low_matchables
913 }
914 }
915
916 pub fn get_index(&self) -> &LicenseIndex {
918 if let Some(query) = self.query {
919 query.index
920 } else {
921 self.whole_query_snapshot
922 .as_ref()
923 .expect("snapshot-backed whole query run should have snapshot data")
924 .index
925 }
926 }
927
928 pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
936 self.source_line_by_pos().get(pos).copied()
937 }
938
939 pub fn tokens(&self) -> &[TokenId] {
945 match self.end {
946 Some(end) => &self.source_tokens()[self.start..=end],
947 None => &[],
948 }
949 }
950
951 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
955 self.tokens()
956 .iter()
957 .copied()
958 .enumerate()
959 .map(|(i, tid)| (self.start + i, tid))
960 }
961
962 pub fn is_digits_only(&self) -> bool {
966 self.tokens()
967 .iter()
968 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
969 }
970
971 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
981 if self.is_digits_only() {
982 return false;
983 }
984
985 let matchables = self.matchables(include_low);
986
987 if exclude_positions.is_empty() {
988 return !matchables.is_empty();
989 }
990
991 let mut matchable_set = matchables;
992 for span in exclude_positions {
993 matchable_set.remove_span(span);
994 }
995
996 !matchable_set.is_empty()
997 }
998
999 pub fn matchables(&self, include_low: bool) -> PositionSet {
1000 if include_low {
1001 if let Some(ref cached) = *self.combined_matchables.borrow() {
1002 return cached.clone();
1003 }
1004 let combined = self.low_matchables().union(&self.high_matchables());
1005 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1006 combined
1007 } else {
1008 self.high_matchables()
1009 }
1010 }
1011
1012 pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1013 let high_matchables = self.high_matchables();
1014 if high_matchables.is_empty() {
1015 return Vec::new();
1016 }
1017
1018 let matchables = self.matchables(true);
1019 self.tokens_with_pos()
1020 .map(|(pos, tid)| {
1021 if matchables.contains(pos) {
1022 Some(tid)
1023 } else {
1024 None
1025 }
1026 })
1027 .collect()
1028 }
1029
1030 pub fn high_matchables(&self) -> PositionSet {
1031 self.cached_high_matchables
1032 .get_or_init(|| {
1033 let start = self.start;
1034 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1035 let source = self.source_high_matchables();
1036 let live_span = PositionSpan::new(start, end);
1037 source
1038 .iter()
1039 .filter(|&pos| live_span.contains(pos))
1040 .collect()
1041 })
1042 .clone()
1043 }
1044
1045 pub fn low_matchables(&self) -> PositionSet {
1046 self.cached_low_matchables
1047 .get_or_init(|| {
1048 let start = self.start;
1049 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1050 let source = self.source_low_matchables();
1051 let live_span = PositionSpan::new(start, end);
1052 source
1053 .iter()
1054 .filter(|&pos| live_span.contains(pos))
1055 .collect()
1056 })
1057 .clone()
1058 }
1059}
1060
1061#[cfg(test)]
1062mod test;