1use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::models::PositionSpan;
6use crate::license_detection::position_set::PositionSet;
7use crate::license_detection::spdx_lid::split_spdx_lid;
8use crate::license_detection::tokenize::STOPWORDS;
9use crate::license_detection::tokenize::tokenize_as_ids;
10use regex::Regex;
11use std::cell::{OnceCell, RefCell};
12use std::collections::HashMap;
13use std::sync::LazyLock;
14use std::time::Instant;
15
/// Matches one query word: a run of word characters (underscore excluded),
/// optionally followed by a single `+` and further word characters.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into `token` and `punct` captures.
///
/// `token` has the same shape as [`QUERY_PATTERN`]; `punct` is a run of
/// underscores, non-word characters, whitespace or `+`, optionally followed by
/// one more underscore, non-word or whitespace character.
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
22
/// One piece of the original text produced while re-tokenizing it for
/// matched-text diagnostics.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the token as emitted by `tokenize_matched_text`.
    value: String,
    /// 1-based number of the line the token was found on.
    line_num: usize,
    /// Running index among dictionary-known words, or `None` for punctuation
    /// and for words that are not in the dictionary.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is one of the matched
    /// positions; always `false` straight out of the tokenizer.
    is_matched: bool,
}
31
/// A text tokenized against a [`LicenseIndex`], together with the per-position
/// bookkeeping built by `Query::with_source_options`.
#[derive(Debug)]
pub struct Query<'a> {
    /// Owned copy of the text the query was built from.
    pub text: String,

    /// Ids of the dictionary-known tokens, in the order they appear in `text`.
    /// An index into this vector is a "position" everywhere else in this type.
    pub tokens: Vec<TokenId>,

    /// 1-based line number of the token at each position; parallel to `tokens`.
    pub line_by_pos: Vec<usize>,

    /// Number of unknown tokens seen after each known position. The `None` key
    /// counts unknown tokens that occur before the first known token.
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Number of stopwords seen after each known position. The `None` key
    /// counts stopwords that occur before the first known token.
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Positions whose token is flagged `is_short_or_digit`.
    pub shorts_and_digits_pos: PositionSet,

    /// Positions whose token kind is `TokenKind::Legalese`, minus any spans
    /// removed through [`Query::subtract`].
    pub high_matchables: PositionSet,

    /// Positions whose token kind is `TokenKind::Regular`, minus any spans
    /// removed through [`Query::subtract`].
    pub low_matchables: PositionSet,

    /// Whether the text is treated as binary-derived: the caller-supplied flag
    /// when one was given, otherwise the result of `Query::detect_binary`.
    pub is_binary: bool,

    /// `(start, end)` position pairs produced by `Query::compute_query_runs`;
    /// `end` is the position of the last token of the run (inclusive).
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// One entry per line that starts with an SPDX-style identifier prefix:
    /// the SPDX text, the known-token position where the prefix starts, and
    /// the position just past the last known token of that line.
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// The license index this query was tokenized against.
    pub index: &'a LicenseIndex,
}
123
/// Returns the lines of `text` numbered `start_line` through `end_line`
/// (1-based, inclusive), joined with `\n`.
///
/// Line terminators are those recognised by [`str::lines`], so both `\n` and
/// `\r\n` are stripped before joining. An empty string is returned when either
/// bound is zero, when `start_line > end_line`, or when the range lies past
/// the end of the text.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // Cannot overflow: `start_line >= 1`, so the count is at most `end_line`.
    let wanted = end_line - start_line + 1;
    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();
    selected.join("\n")
}
142
143pub fn matched_text_diagnostics_from_text(
144 text: &str,
145 query: &Query<'_>,
146 matched_positions: &PositionSet,
147 start_pos: usize,
148 end_pos: usize,
149 start_line: usize,
150 end_line: usize,
151) -> String {
152 let tokens = tokenize_matched_text(text, query);
153 let reportable_tokens = collect_reportable_tokens(
154 tokens,
155 matched_positions,
156 start_pos,
157 end_pos,
158 start_line,
159 end_line,
160 );
161 let line_endings = collect_line_endings(text);
162
163 render_diagnostic_tokens(&reportable_tokens, &line_endings)
164}
165
166fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
167 let mut tokens = Vec::new();
168 let mut pos = 0usize;
169 let mut line_num = 1usize;
170
171 for line in text.split_inclusive('\n') {
172 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
173 if let Some(token_match) = capture.name("token") {
174 let token_text = token_match.as_str();
175 let retokenized: Vec<String> = QUERY_PATTERN
176 .find_iter(&token_text.to_lowercase())
177 .map(|m| m.as_str().to_string())
178 .filter(|token| !STOPWORDS.contains(token.as_str()))
179 .collect();
180
181 if retokenized.is_empty() {
182 tokens.push(MatchedTextToken {
183 value: token_text.to_string(),
184 line_num,
185 pos: None,
186 is_text: true,
187 is_matched: false,
188 });
189 } else if retokenized.len() == 1 {
190 let token = &retokenized[0];
191 let token_pos = if query.index.dictionary.get(token).is_some() {
192 let current_pos = pos;
193 pos += 1;
194 Some(current_pos)
195 } else {
196 None
197 };
198
199 tokens.push(MatchedTextToken {
200 value: token_text.to_string(),
201 line_num,
202 pos: token_pos,
203 is_text: true,
204 is_matched: false,
205 });
206 } else {
207 for token in retokenized {
208 let token_pos = if query.index.dictionary.get(&token).is_some() {
209 let current_pos = pos;
210 pos += 1;
211 Some(current_pos)
212 } else {
213 None
214 };
215
216 tokens.push(MatchedTextToken {
217 value: token,
218 line_num,
219 pos: token_pos,
220 is_text: true,
221 is_matched: false,
222 });
223 }
224 }
225 } else if let Some(punct_match) = capture.name("punct") {
226 tokens.push(MatchedTextToken {
227 value: punct_match.as_str().to_string(),
228 line_num,
229 pos: None,
230 is_text: false,
231 is_matched: false,
232 });
233 }
234 }
235
236 line_num += 1;
237 }
238
239 tokens
240}
241
/// Selects the tokens that belong in the diagnostic rendering of a match.
///
/// Only tokens on lines `start_line..=end_line` are considered. Within that
/// range a token is kept when:
/// * its position is in `matched_positions` (it is also flagged `is_matched`);
/// * it lies between the token at `start_pos` and the token at `end_pos`,
///   both included;
/// * it directly follows the `end_pos` token and is non-blank punctuation.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`: the `start_pos` token has been seen and the end not yet reached.
    let mut started = false;
    // `finished`: the `end_pos` token has been seen.
    let mut finished = false;
    // Index (in the full token stream) of the `end_pos` token, until consumed.
    let mut end_real_pos = None;
    // Index of the previously processed in-range token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        // Ignore everything before the matched line range...
        if token.line_num < start_line {
            continue;
        }

        // ...and stop at the first token past it.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        // Matched words are always reported and flagged for rendering.
        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Between start and end everything is reported, matched or not.
        if started && !finished {
            is_included = true;
        }

        // This check runs after the inclusion check above, so the end token
        // itself is still reported.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token processed right after the end token: keep it
        // if it is non-blank punctuation. Clearing `end_real_pos` makes this a
        // one-shot check.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
303
/// Returns the terminator of every line of `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or an empty string for a final line that
/// has no terminator. Empty text yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let ending = match line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
317
318fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
319 let mut rendered = String::new();
320 let mut previous_line: Option<usize> = None;
321
322 for token in tokens {
323 if let Some(prev_line) = previous_line
324 && token.line_num > prev_line
325 {
326 for line in prev_line..token.line_num {
327 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
328 rendered.push_str(line_ending.as_str());
329 }
330 }
331 }
332
333 let token_value = if token.is_text {
334 token.value.as_str()
335 } else {
336 token
337 .value
338 .strip_suffix("\r\n")
339 .or_else(|| token.value.strip_suffix('\n'))
340 .unwrap_or(token.value.as_str())
341 };
342
343 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
344 if token.is_matched {
345 rendered.push_str(token_value);
346 } else {
347 rendered.push('[');
348 rendered.push_str(token_value);
349 rendered.push(']');
350 }
351 } else {
352 rendered.push_str(token_value);
353 }
354
355 previous_line = Some(token.line_num);
356 }
357
358 rendered
359}
360
361impl<'a> Query<'a> {
    /// Line threshold passed to `compute_query_runs` when the text is not
    /// binary-derived: a run is closed once this many consecutive lines have
    /// counted as "empty".
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold passed to `compute_query_runs` for binary-derived text.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when `break_long_lines` splits a line.
    const MAX_TOKEN_PER_LINE: usize = 25;
379
380 fn compute_spdx_offset(
381 tokens: &[QueryToken],
382 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
383 ) -> Option<usize> {
384 let get_known_id = |i: usize| -> Option<TokenId> {
385 match tokens.get(i)? {
386 QueryToken::Known(known) => Some(known.id),
387 _ => None,
388 }
389 };
390
391 let spdx_id = dictionary.get("spdx")?;
392 let license_id = dictionary.get("license")?;
393 let identifier_id = dictionary.get("identifier")?;
394 let licence_id = dictionary.get("licence");
395
396 let licenses_id = dictionary.get("licenses");
397 let nuget_id = dictionary.get("nuget");
398 let org_id = dictionary.get("org");
399
400 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
401 ids.iter().all(|id| id.is_some())
402 && ids[0] == Some(spdx_id)
403 && (ids[1] == Some(license_id) || ids[1] == licence_id)
404 && ids[2] == Some(identifier_id)
405 };
406
407 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
408 licenses_id.is_some()
409 && nuget_id.is_some()
410 && org_id.is_some()
411 && ids[0] == licenses_id
412 && ids[1] == Some(nuget_id.unwrap())
413 && ids[2] == Some(org_id.unwrap())
414 };
415
416 if tokens.len() >= 3 {
417 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
418 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
419 return Some(0);
420 }
421 }
422
423 if tokens.len() >= 4 {
424 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
425 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
426 return Some(1);
427 }
428 }
429
430 if tokens.len() >= 5 {
431 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
432 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
433 return Some(2);
434 }
435 }
436
437 None
438 }
439
    /// Builds a [`Query`] from already-extracted `text`, without a deadline.
    ///
    /// Equivalent to [`Query::from_extracted_text_with_deadline`] with
    /// `deadline` set to `None`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned while building the query.
    pub fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, anyhow::Error> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
447
448 pub fn from_extracted_text_with_deadline(
449 text: &str,
450 index: &'a LicenseIndex,
451 binary_derived: bool,
452 deadline: Option<Instant>,
453 ) -> Result<Self, anyhow::Error> {
454 let line_threshold = if binary_derived {
455 Self::BINARY_LINE_THRESHOLD
456 } else {
457 Self::TEXT_LINE_THRESHOLD
458 };
459
460 Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
461 }
462
463 pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
467 self.query_run_ranges
468 .iter()
469 .map(|&(start, end)| QueryRun::new(self, start, end))
470 .collect()
471 }
472
    /// Tokenizes `text` line by line against `index` and assembles a [`Query`].
    ///
    /// * `line_threshold` is forwarded to `compute_query_runs`.
    /// * `binary_derived` overrides binary detection when `Some`; when `None`
    ///   the flag comes from `detect_binary`.
    /// * `deadline` is checked on entry, every 128 lines, and once more after
    ///   tokenization.
    ///
    /// # Errors
    ///
    /// Propagates errors from `ensure_within_deadline` and `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one.
        let mut known_pos: Option<usize> = None;
        // Becomes true once any known token has been seen.
        let mut started = false;
        // 1-based number of the line being processed.
        let mut current_line = 1usize;

        // Per line: `Some(token)` for known tokens, `None` for unknown ones
        // (stopwords are not recorded). Input to `compute_query_runs`.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for (line_index, line) in text.lines().enumerate() {
            // Re-check the deadline periodically rather than on every line.
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            // Position of the first known token on this line, if any.
            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        // First known token gets position 0, then +1 each time.
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            // `known_pos` was set to `Some` just above.
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknown tokens before any known token are filed under `None`.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Later ones are attributed to the preceding known position.
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Stopwords are only counted, never added to `line_tokens`.
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // Record the line for SPDX identifier matching when it starts with
            // an SPDX-style prefix and holds at least one known token.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                // NOTE(review): `offset` is counted over all query tokens of the
                // line (unknown and stopword tokens included) but is added to a
                // known-token position -- confirm this is intended when the
                // prefix is preceded by a non-known token.
                let spdx_start_known_pos = line_first_known_pos + offset;

                // `line_last_known_pos` is `Some` here because this line
                // contained a known token.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    // Stored end is one past the last known position.
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }

            tokens_by_line.push(line_tokens);
            current_line += 1;
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        // Positions whose token kind is Legalese.
        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        // Positions whose token kind is Regular.
        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
601
602 fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
616 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
617
618 if null_byte_count > 0 {
619 return Ok(true);
620 }
621
622 let non_printable_ratio = text
623 .chars()
624 .filter(|&c| {
625 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
626 })
627 .count() as f64
628 / text.len().max(1) as f64;
629
630 Ok(non_printable_ratio > 0.3)
631 }
632
633 fn detect_long_lines(text: &str) -> bool {
643 text.lines()
644 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
645 }
646
647 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
648 lines
649 .iter()
650 .flat_map(|line| {
651 if line.is_empty() {
652 return Vec::new();
653 }
654
655 if line.len() <= Self::MAX_TOKEN_PER_LINE {
656 vec![line.clone()]
657 } else {
658 line.chunks(Self::MAX_TOKEN_PER_LINE)
659 .map(|chunk| chunk.to_vec())
660 .collect()
661 }
662 })
663 .collect()
664 }
665
    /// Groups the known tokens into query runs.
    ///
    /// Walks the per-line tokens (`Some` = known token, `None` = unknown) and
    /// closes the current run once `line_threshold` consecutive lines have
    /// counted as "empty". A line counts as empty when it has no tokens, no
    /// known tokens, only digit-only known tokens, or no Legalese token. Runs
    /// made only of digit-only tokens are discarded.
    ///
    /// Returns `(start, end)` pairs of known-token positions; `end` is the
    /// position of the last token in the run and is always `Some`.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        // When long lines are present they are split into chunks first; this
        // also drops lines that have no tokens at all.
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run has not received any token yet.
        let mut query_run_end = None;
        // Consecutive lines counted as empty since the last Legalese line.
        let mut empty_lines = 0usize;
        // Running known-token position.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Enough empty lines after a non-empty run: close it and start over.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // A run that has no token yet starts at the current position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) do not prevent a line from being all-digit.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Only known tokens advance `pos` and extend the current run.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with a Legalese token resets the empty-line counter.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the trailing run.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
742
    /// Returns the 1-based line number of the token at `pos`, or `None` when
    /// `pos` is out of range.
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
756
    /// Returns `true` when the query holds no known tokens.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
762
    /// Returns a [`QueryRun`] spanning every token of this query.
    ///
    /// The run is backed by a cloned snapshot of the tokens, line numbers and
    /// matchable sets, so it does not keep a borrow of `self`.
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
769
    /// Removes the positions covered by `span` from both the high and the low
    /// matchable sets.
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
782
    /// Returns lines `start_line..=end_line` (1-based) of the query text joined
    /// with `\n`; see [`matched_text_from_text`] for the edge cases.
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
799}
800
/// Owned copy of the [`Query`] data a whole-query [`QueryRun`] needs, taken
/// when the run is created.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// Index the source query was built against.
    index: &'a LicenseIndex,
    /// Clone of the query's known-token ids.
    tokens: Vec<TokenId>,
    /// Clone of the query's line number per position.
    line_by_pos: Vec<usize>,
    /// Clone of the query's high matchables at snapshot time.
    high_matchables: PositionSet,
    /// Clone of the query's low matchables at snapshot time.
    low_matchables: PositionSet,
}
809
/// A contiguous slice of a [`Query`]'s token positions.
///
/// A run reads its data either from a borrowed query (`query`) or from an
/// owned snapshot (`whole_query_snapshot`); the constructors in this file set
/// exactly one of the two.
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Borrowed source query; `None` for snapshot-backed runs.
    query: Option<&'a Query<'a>>,
    /// Owned source data; `None` for query-backed runs.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// Position of the first token of the run.
    pub start: usize,
    /// Position of the last token of the run (inclusive); `None` means the run
    /// has no tokens.
    pub end: Option<usize>,
    /// High matchables restricted to this run, computed on first use.
    cached_high_matchables: OnceCell<PositionSet>,
    /// Low matchables restricted to this run, computed on first use.
    cached_low_matchables: OnceCell<PositionSet>,
    /// Union of the low and high matchables, filled in by `matchables(true)`.
    combined_matchables: RefCell<Option<PositionSet>>,
}
827
828impl<'a> QueryRun<'a> {
    /// Creates a run over positions `start..=end` of `query`, borrowing the
    /// query for its data. An `end` of `None` creates a run without tokens.
    ///
    /// All matchable caches start out empty.
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
848
849 fn whole_query_snapshot(query: &Query<'a>) -> Self {
850 let end = if query.is_empty() {
851 None
852 } else {
853 Some(query.tokens.len() - 1)
854 };
855
856 Self {
857 query: None,
858 whole_query_snapshot: Some(WholeQueryRunSnapshot {
859 index: query.index,
860 tokens: query.tokens.clone(),
861 line_by_pos: query.line_by_pos.clone(),
862 high_matchables: query.high_matchables.clone(),
863 low_matchables: query.low_matchables.clone(),
864 }),
865 start: 0,
866 end,
867 cached_high_matchables: OnceCell::new(),
868 cached_low_matchables: OnceCell::new(),
869 combined_matchables: RefCell::new(None),
870 }
871 }
872
873 fn source_tokens(&self) -> &[TokenId] {
874 if let Some(query) = self.query {
875 &query.tokens
876 } else {
877 &self
878 .whole_query_snapshot
879 .as_ref()
880 .expect("snapshot-backed whole query run should have snapshot data")
881 .tokens
882 }
883 }
884
885 fn source_line_by_pos(&self) -> &[usize] {
886 if let Some(query) = self.query {
887 &query.line_by_pos
888 } else {
889 &self
890 .whole_query_snapshot
891 .as_ref()
892 .expect("snapshot-backed whole query run should have snapshot data")
893 .line_by_pos
894 }
895 }
896
897 fn source_high_matchables(&self) -> &PositionSet {
898 if let Some(query) = self.query {
899 &query.high_matchables
900 } else {
901 &self
902 .whole_query_snapshot
903 .as_ref()
904 .expect("snapshot-backed whole query run should have snapshot data")
905 .high_matchables
906 }
907 }
908
909 fn source_low_matchables(&self) -> &PositionSet {
910 if let Some(query) = self.query {
911 &query.low_matchables
912 } else {
913 &self
914 .whole_query_snapshot
915 .as_ref()
916 .expect("snapshot-backed whole query run should have snapshot data")
917 .low_matchables
918 }
919 }
920
921 pub fn get_index(&self) -> &LicenseIndex {
923 if let Some(query) = self.query {
924 query.index
925 } else {
926 self.whole_query_snapshot
927 .as_ref()
928 .expect("snapshot-backed whole query run should have snapshot data")
929 .index
930 }
931 }
932
    /// Returns the line number recorded for the absolute position `pos` in the
    /// underlying source, or `None` when `pos` is out of range. The lookup is
    /// not restricted to this run's `start..=end` range.
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.source_line_by_pos().get(pos).copied()
    }
943
944 pub fn tokens(&self) -> &[TokenId] {
950 match self.end {
951 Some(end) => &self.source_tokens()[self.start..=end],
952 None => &[],
953 }
954 }
955
956 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
960 self.tokens()
961 .iter()
962 .copied()
963 .enumerate()
964 .map(|(i, tid)| (self.start + i, tid))
965 }
966
967 pub fn is_digits_only(&self) -> bool {
971 self.tokens()
972 .iter()
973 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
974 }
975
976 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
986 if self.is_digits_only() {
987 return false;
988 }
989
990 let matchables = self.matchables(include_low);
991
992 if exclude_positions.is_empty() {
993 return !matchables.is_empty();
994 }
995
996 let mut matchable_set = matchables;
997 for span in exclude_positions {
998 matchable_set.remove_span(span);
999 }
1000
1001 !matchable_set.is_empty()
1002 }
1003
1004 pub fn matchables(&self, include_low: bool) -> PositionSet {
1005 if include_low {
1006 if let Some(ref cached) = *self.combined_matchables.borrow() {
1007 return cached.clone();
1008 }
1009 let combined = self.low_matchables().union(&self.high_matchables());
1010 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1011 combined
1012 } else {
1013 self.high_matchables()
1014 }
1015 }
1016
1017 pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1018 let high_matchables = self.high_matchables();
1019 if high_matchables.is_empty() {
1020 return Vec::new();
1021 }
1022
1023 let matchables = self.matchables(true);
1024 self.tokens_with_pos()
1025 .map(|(pos, tid)| {
1026 if matchables.contains(pos) {
1027 Some(tid)
1028 } else {
1029 None
1030 }
1031 })
1032 .collect()
1033 }
1034
1035 pub fn high_matchables(&self) -> PositionSet {
1036 self.cached_high_matchables
1037 .get_or_init(|| {
1038 let start = self.start;
1039 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1040 let source = self.source_high_matchables();
1041 let live_span = PositionSpan::new(start, end);
1042 source
1043 .iter()
1044 .filter(|&pos| live_span.contains(pos))
1045 .collect()
1046 })
1047 .clone()
1048 }
1049
1050 pub fn low_matchables(&self) -> PositionSet {
1051 self.cached_low_matchables
1052 .get_or_init(|| {
1053 let start = self.start;
1054 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1055 let source = self.source_low_matchables();
1056 let live_span = PositionSpan::new(start, end);
1057 source
1058 .iter()
1059 .filter(|&pos| live_span.contains(pos))
1060 .collect()
1061 })
1062 .clone()
1063 }
1064}
1065
1066#[cfg(test)]
1067mod test;