1use crate::license_detection::index::LicenseIndex;
7use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
8use crate::license_detection::models::PositionSpan;
9use crate::license_detection::position_set::PositionSet;
10use crate::license_detection::spdx_lid::split_spdx_lid;
11use crate::license_detection::tokenize::STOPWORDS;
12use crate::license_detection::tokenize::tokenize_as_ids;
13use regex::Regex;
14use std::cell::{OnceCell, RefCell};
15use std::collections::HashMap;
16use std::sync::LazyLock;
17use std::time::Instant;
18
/// Matches a single query token: one or more word characters other than `_`,
/// an optional `+`, then any further word characters other than `_`.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into two kinds of captures: `token` (same shape as
/// [`QUERY_PATTERN`]) and `punct` (a run of underscores, non-word characters,
/// whitespace or `+`, plus at most one more such character other than `+`).
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
25
/// One piece of source text produced by `tokenize_matched_text`, used when
/// rendering matched-text diagnostics.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece (a word or a run of punctuation/whitespace).
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Known-token position assigned by the tokenizer when the word is in the
    /// index dictionary; `None` for punctuation and for words not in it.
    pos: Option<usize>,
    /// `true` for `token` captures, `false` for `punct` captures.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is in the matched set.
    is_matched: bool,
}
34
/// A tokenized piece of text, prepared for matching against a [`LicenseIndex`].
#[derive(Debug)]
pub struct Query<'a> {
    /// Owned copy of the text the query was built from.
    pub text: String,

    /// Ids of the known (dictionary) tokens, in text order. The index into
    /// this vector is the token's "position".
    pub tokens: Vec<TokenId>,

    /// 1-based line number for each known-token position.
    pub line_by_pos: Vec<usize>,

    /// Number of unknown tokens seen after each known-token position. The
    /// `None` key counts unknown tokens seen before the first known token.
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Number of stopwords seen after each known-token position. The `None`
    /// key counts stopwords seen before the first known token.
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Positions of known tokens flagged `is_short_or_digit`.
    pub shorts_and_digits_pos: PositionSet,

    /// Positions whose token kind is `TokenKind::Legalese`. Shrinks when
    /// [`Query::subtract`] is called.
    pub high_matchables: PositionSet,

    /// Positions whose token kind is `TokenKind::Regular`. Shrinks when
    /// [`Query::subtract`] is called.
    pub low_matchables: PositionSet,

    /// Whether the text is treated as binary-derived: either the flag passed
    /// by the caller or, when none was given, the result of `detect_binary`.
    pub is_binary: bool,

    /// `(start, end)` position pairs of the query runs, with `end` inclusive.
    /// Turned into [`QueryRun`] values by [`Query::query_runs`].
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// Lines recognized as SPDX (or NuGet) license identifiers, as
    /// `(text, start_pos, end_pos)` where `end_pos` is one past the last
    /// known-token position of the line.
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// The license index whose dictionary was used to tokenize the text.
    pub index: &'a LicenseIndex,
}
126
/// Returns lines `start_line..=end_line` of `text` (1-based, inclusive),
/// joined with `\n`.
///
/// An empty string is returned when either bound is zero or when
/// `start_line > end_line`. Lines beyond the end of `text` are ignored, and
/// `\r\n` endings come out as `\n` because `str::lines` strips them.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // `start_line >= 1` and `start_line <= end_line`, so neither expression can wrap.
    let skipped = start_line - 1;
    let wanted = end_line - skipped;

    let selected: Vec<&str> = text.lines().skip(skipped).take(wanted).collect();
    selected.join("\n")
}
145
146pub fn matched_text_diagnostics_from_text(
147 text: &str,
148 query: &Query<'_>,
149 matched_positions: &PositionSet,
150 start_pos: usize,
151 end_pos: usize,
152 start_line: usize,
153 end_line: usize,
154) -> String {
155 let tokens = tokenize_matched_text(text, query);
156 let reportable_tokens = collect_reportable_tokens(
157 tokens,
158 matched_positions,
159 start_pos,
160 end_pos,
161 start_line,
162 end_line,
163 );
164 let line_endings = collect_line_endings(text);
165
166 render_diagnostic_tokens(&reportable_tokens, &line_endings)
167}
168
169fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
170 let mut tokens = Vec::new();
171 let mut pos = 0usize;
172 let mut line_num = 1usize;
173
174 for line in text.split_inclusive('\n') {
175 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
176 if let Some(token_match) = capture.name("token") {
177 let token_text = token_match.as_str();
178 let retokenized: Vec<String> = QUERY_PATTERN
179 .find_iter(&token_text.to_lowercase())
180 .map(|m| m.as_str().to_string())
181 .filter(|token| !STOPWORDS.contains(token.as_str()))
182 .collect();
183
184 if retokenized.is_empty() {
185 tokens.push(MatchedTextToken {
186 value: token_text.to_string(),
187 line_num,
188 pos: None,
189 is_text: true,
190 is_matched: false,
191 });
192 } else if retokenized.len() == 1 {
193 let token = &retokenized[0];
194 let token_pos = if query.index.dictionary.get(token).is_some() {
195 let current_pos = pos;
196 pos += 1;
197 Some(current_pos)
198 } else {
199 None
200 };
201
202 tokens.push(MatchedTextToken {
203 value: token_text.to_string(),
204 line_num,
205 pos: token_pos,
206 is_text: true,
207 is_matched: false,
208 });
209 } else {
210 for token in retokenized {
211 let token_pos = if query.index.dictionary.get(&token).is_some() {
212 let current_pos = pos;
213 pos += 1;
214 Some(current_pos)
215 } else {
216 None
217 };
218
219 tokens.push(MatchedTextToken {
220 value: token,
221 line_num,
222 pos: token_pos,
223 is_text: true,
224 is_matched: false,
225 });
226 }
227 }
228 } else if let Some(punct_match) = capture.name("punct") {
229 tokens.push(MatchedTextToken {
230 value: punct_match.as_str().to_string(),
231 line_num,
232 pos: None,
233 is_text: false,
234 is_matched: false,
235 });
236 }
237 }
238
239 line_num += 1;
240 }
241
242 tokens
243}
244
245fn collect_reportable_tokens(
246 tokens: Vec<MatchedTextToken>,
247 matched_positions: &PositionSet,
248 start_pos: usize,
249 end_pos: usize,
250 start_line: usize,
251 end_line: usize,
252) -> Vec<MatchedTextToken> {
253 let mut reportable = Vec::new();
254 let mut started = false;
255 let mut finished = false;
256 let mut end_real_pos = None;
257 let mut last_real_pos = None;
258
259 for (real_pos, mut token) in tokens.into_iter().enumerate() {
260 if token.line_num < start_line {
261 continue;
262 }
263
264 if token.line_num > end_line {
265 break;
266 }
267
268 let mut is_included = false;
269
270 if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
271 token.is_matched = true;
272 is_included = true;
273 }
274
275 if !started && token.pos == Some(start_pos) {
276 started = true;
277 is_included = true;
278 }
279
280 if started && !finished {
281 is_included = true;
282 }
283
284 if token.pos == Some(end_pos) {
285 finished = true;
286 started = false;
287 end_real_pos = Some(real_pos);
288 }
289
290 if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
291 end_real_pos = None;
292 if !token.is_text && !token.value.trim().is_empty() {
293 is_included = true;
294 }
295 }
296
297 last_real_pos = Some(real_pos);
298
299 if is_included {
300 reportable.push(token);
301 }
302 }
303
304 reportable
305}
306
/// Returns the line terminator of every line in `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or `""` for a final line with no
/// terminator. Empty input yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let terminator = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(terminator.to_string());
    }

    endings
}
320
321fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
322 let mut rendered = String::new();
323 let mut previous_line: Option<usize> = None;
324
325 for token in tokens {
326 if let Some(prev_line) = previous_line
327 && token.line_num > prev_line
328 {
329 for line in prev_line..token.line_num {
330 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
331 rendered.push_str(line_ending.as_str());
332 }
333 }
334 }
335
336 let token_value = if token.is_text {
337 token.value.as_str()
338 } else {
339 token
340 .value
341 .strip_suffix("\r\n")
342 .or_else(|| token.value.strip_suffix('\n'))
343 .unwrap_or(token.value.as_str())
344 };
345
346 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
347 if token.is_matched {
348 rendered.push_str(token_value);
349 } else {
350 rendered.push('[');
351 rendered.push_str(token_value);
352 rendered.push(']');
353 }
354 } else {
355 rendered.push_str(token_value);
356 }
357
358 previous_line = Some(token.line_num);
359 }
360
361 rendered
362}
363
364impl<'a> Query<'a> {
    /// Line threshold passed to `compute_query_runs` for text that is not
    /// binary-derived: a query run is broken once this many consecutive
    /// lines counted as "empty" have been seen.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Same threshold, used when the text is binary-derived.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when `break_long_lines` splits a line.
    const MAX_TOKEN_PER_LINE: usize = 25;
382
383 fn compute_spdx_offset(
384 tokens: &[QueryToken],
385 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
386 ) -> Option<usize> {
387 let get_known_id = |i: usize| -> Option<TokenId> {
388 match tokens.get(i)? {
389 QueryToken::Known(known) => Some(known.id),
390 _ => None,
391 }
392 };
393
394 let spdx_id = dictionary.get("spdx")?;
395 let license_id = dictionary.get("license")?;
396 let identifier_id = dictionary.get("identifier")?;
397 let licence_id = dictionary.get("licence");
398
399 let licenses_id = dictionary.get("licenses");
400 let nuget_id = dictionary.get("nuget");
401 let org_id = dictionary.get("org");
402
403 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
404 ids.iter().all(|id| id.is_some())
405 && ids[0] == Some(spdx_id)
406 && (ids[1] == Some(license_id) || ids[1] == licence_id)
407 && ids[2] == Some(identifier_id)
408 };
409
410 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
411 licenses_id.is_some()
412 && nuget_id.is_some()
413 && org_id.is_some()
414 && ids[0] == licenses_id
415 && ids[1] == Some(nuget_id.unwrap())
416 && ids[2] == Some(org_id.unwrap())
417 };
418
419 if tokens.len() >= 3 {
420 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
421 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
422 return Some(0);
423 }
424 }
425
426 if tokens.len() >= 4 {
427 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
428 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
429 return Some(1);
430 }
431 }
432
433 if tokens.len() >= 5 {
434 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
435 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
436 return Some(2);
437 }
438 }
439
440 None
441 }
442
    /// Builds a query from already-extracted `text`, with no deadline.
    ///
    /// Shorthand for [`Query::from_extracted_text_with_deadline`] with
    /// `deadline` set to `None`.
    ///
    /// # Errors
    ///
    /// Returns whatever error the underlying construction returns.
    pub fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, anyhow::Error> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
450
    /// Builds a query from already-extracted `text`, optionally bounded by
    /// `deadline`.
    ///
    /// `binary_derived` is recorded as the query's `is_binary` flag and also
    /// selects the query-run line threshold.
    ///
    /// # Errors
    ///
    /// Propagates errors from `with_source_options`, which calls
    /// `ensure_within_deadline` (presumably failing once `deadline` has
    /// passed — confirm against that helper).
    pub fn from_extracted_text_with_deadline(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        // Binary-derived text uses the larger threshold before a run is broken.
        let line_threshold = if binary_derived {
            Self::BINARY_LINE_THRESHOLD
        } else {
            Self::TEXT_LINE_THRESHOLD
        };

        // Passing `Some(..)` means binary auto-detection is skipped.
        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
    }
465
466 pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
470 self.query_run_ranges
471 .iter()
472 .map(|&(start, end)| QueryRun::new(self, start, end))
473 .collect()
474 }
475
476 fn with_source_options(
477 text: &str,
478 index: &'a LicenseIndex,
479 line_threshold: usize,
480 binary_derived: Option<bool>,
481 deadline: Option<Instant>,
482 ) -> Result<Self, anyhow::Error> {
483 crate::license_detection::ensure_within_deadline(deadline)?;
484 let is_binary = match binary_derived {
485 Some(is_binary) => is_binary,
486 None => Self::detect_binary(text)?,
487 };
488 let has_long_lines = Self::detect_long_lines(text);
489
490 let mut tokens = Vec::new();
491 let mut line_by_pos = Vec::new();
492 let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
493 let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
494 let mut shorts_and_digits_pos = PositionSet::new();
495 let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();
496
497 let mut known_pos: Option<usize> = None;
498 let mut started = false;
499 let mut current_line = 1usize;
500
501 let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();
502
503 for (line_index, line) in text.lines().enumerate() {
504 if line_index.is_multiple_of(128) {
505 crate::license_detection::ensure_within_deadline(deadline)?;
506 }
507
508 let line_trimmed = line.trim();
509 let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();
510
511 let mut line_first_known_pos = None;
512
513 let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);
514
515 for query_token in &line_query_tokens {
516 match query_token {
517 QueryToken::Known(known_token) => {
518 known_pos = Some(known_pos.map_or(0, |p| p + 1));
519 started = true;
520 tokens.push(known_token.id);
521 line_by_pos.push(current_line);
522 line_tokens.push(Some(*known_token));
523
524 if line_first_known_pos.is_none() {
525 line_first_known_pos = known_pos;
526 }
527
528 if known_token.is_short_or_digit {
529 let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
530 }
531 }
532 QueryToken::Unknown if !started => {
533 *unknowns_by_pos.entry(None).or_insert(0) += 1;
534 line_tokens.push(None);
535 }
536 QueryToken::Unknown => {
537 *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
538 line_tokens.push(None);
539 }
540 QueryToken::Stopword if !started => {
541 *stopwords_by_pos.entry(None).or_insert(0) += 1;
542 }
543 QueryToken::Stopword => {
544 *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
545 }
546 }
547 }
548
549 let line_last_known_pos = known_pos;
550
551 let spdx_start_offset =
552 Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);
553
554 if let Some(offset) = spdx_start_offset
555 && let Some(line_first_known_pos) = line_first_known_pos
556 {
557 let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
558 let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
559 let spdx_start_known_pos = line_first_known_pos + offset;
560
561 if spdx_start_known_pos <= line_last_known_pos.unwrap() {
562 let spdx_end = line_last_known_pos.unwrap() + 1;
563 spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
564 }
565 }
566
567 tokens_by_line.push(line_tokens);
568 current_line += 1;
569 }
570
571 crate::license_detection::ensure_within_deadline(deadline)?;
572
573 let high_matchables: PositionSet = tokens
574 .iter()
575 .enumerate()
576 .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
577 .map(|(pos, _tid)| pos)
578 .collect();
579
580 let low_matchables: PositionSet = tokens
581 .iter()
582 .enumerate()
583 .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
584 .map(|(pos, _tid)| pos)
585 .collect();
586
587 let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);
588
589 Ok(Query {
590 text: text.to_string(),
591 tokens,
592 line_by_pos,
593 unknowns_by_pos,
594 stopwords_by_pos,
595 shorts_and_digits_pos,
596 high_matchables,
597 low_matchables,
598 is_binary,
599 query_run_ranges: query_runs,
600 spdx_lines,
601 index,
602 })
603 }
604
605 fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
619 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
620
621 if null_byte_count > 0 {
622 return Ok(true);
623 }
624
625 let non_printable_ratio = text
626 .chars()
627 .filter(|&c| {
628 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
629 })
630 .count() as f64
631 / text.len().max(1) as f64;
632
633 Ok(non_printable_ratio > 0.3)
634 }
635
636 fn detect_long_lines(text: &str) -> bool {
646 text.lines()
647 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
648 }
649
650 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
651 lines
652 .iter()
653 .flat_map(|line| {
654 if line.is_empty() {
655 return Vec::new();
656 }
657
658 if line.len() <= Self::MAX_TOKEN_PER_LINE {
659 vec![line.clone()]
660 } else {
661 line.chunks(Self::MAX_TOKEN_PER_LINE)
662 .map(|chunk| chunk.to_vec())
663 .collect()
664 }
665 })
666 .collect()
667 }
668
669 fn compute_query_runs(
670 tokens_by_line: &[Vec<Option<KnownToken>>],
671 line_threshold: usize,
672 has_long_lines: bool,
673 ) -> Vec<(usize, Option<usize>)> {
674 let processed_lines = if has_long_lines {
675 Self::break_long_lines(tokens_by_line)
676 } else {
677 tokens_by_line.to_vec()
678 };
679
680 let mut query_runs = Vec::new();
681 let mut query_run_start = 0usize;
682 let mut query_run_end = None;
683 let mut empty_lines = 0usize;
684 let mut pos = 0usize;
685 let mut query_run_is_all_digit = true;
686
687 for line_tokens in processed_lines {
688 if query_run_end.is_some() && empty_lines >= line_threshold {
689 if !query_run_is_all_digit {
690 query_runs.push((query_run_start, query_run_end));
691 }
692 query_run_start = pos;
693 query_run_end = None;
694 empty_lines = 0;
695 query_run_is_all_digit = true;
696 }
697
698 if query_run_end.is_none() {
699 query_run_start = pos;
700 }
701
702 if line_tokens.is_empty() {
703 empty_lines += 1;
704 continue;
705 }
706
707 let line_is_all_digit = line_tokens
708 .iter()
709 .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
710 let mut line_has_known_tokens = false;
711 let mut line_has_good_tokens = false;
712
713 for known in line_tokens.into_iter().flatten() {
714 line_has_known_tokens = true;
715 if known.kind == TokenKind::Legalese {
716 line_has_good_tokens = true;
717 }
718 if !known.is_digit_only {
719 query_run_is_all_digit = false;
720 }
721 query_run_end = Some(pos);
722 pos += 1;
723 }
724
725 if line_is_all_digit || !line_has_known_tokens {
726 empty_lines += 1;
727 continue;
728 }
729
730 if line_has_good_tokens {
731 empty_lines = 0;
732 } else {
733 empty_lines += 1;
734 }
735 }
736
737 if let Some(end) = query_run_end
738 && !query_run_is_all_digit
739 {
740 query_runs.push((query_run_start, Some(end)));
741 }
742
743 query_runs
744 }
745
    /// Returns the 1-based line number recorded for known-token position
    /// `pos`, or `None` when `pos` is out of range.
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
759
    /// Returns `true` when the query holds no known tokens.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
765
    /// Returns a [`QueryRun`] spanning every token of the query.
    ///
    /// The run is backed by a snapshot (clones of the tokens, line map and
    /// matchable sets) rather than a borrow of `self`, so later changes to
    /// this query are not reflected in it.
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
772
    /// Removes every position in `span` from both the high and the low
    /// matchable sets.
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
785
    /// Returns lines `start_line..=end_line` (1-based, inclusive) of the
    /// query text, joined with `\n`. See [`matched_text_from_text`] for the
    /// handling of invalid ranges.
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
802}
803
/// Owned copy of the parts of a [`Query`] that a whole-query [`QueryRun`]
/// reads, taken at the time the run is created.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The license index of the originating query.
    index: &'a LicenseIndex,
    /// Clone of the query's known-token ids.
    tokens: Vec<TokenId>,
    /// Clone of the query's position-to-line map.
    line_by_pos: Vec<usize>,
    /// Clone of the query's high matchable positions.
    high_matchables: PositionSet,
    /// Clone of the query's low matchable positions.
    low_matchables: PositionSet,
}
812
/// A contiguous slice of a query's known tokens, from `start` to `end`
/// inclusive.
///
/// A run reads its data either from a borrowed [`Query`] (`query`) or from an
/// owned snapshot (`whole_query_snapshot`); the constructors in this file set
/// exactly one of the two.
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Borrowed source query, when the run was built with [`QueryRun::new`].
    query: Option<&'a Query<'a>>,
    /// Owned source data, when the run was built as a whole-query snapshot.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// Position of the first token in the run.
    pub start: usize,
    /// Position of the last token in the run (inclusive); `None` for an empty run.
    pub end: Option<usize>,
    /// Lazily computed high matchables restricted to this run.
    cached_high_matchables: OnceCell<PositionSet>,
    /// Lazily computed low matchables restricted to this run.
    cached_low_matchables: OnceCell<PositionSet>,
    /// Lazily computed union of the low and high matchables of this run.
    combined_matchables: RefCell<Option<PositionSet>>,
}
830
831impl<'a> QueryRun<'a> {
    /// Creates a run over positions `start..=end` of `query`, borrowing the
    /// query. `end` of `None` denotes an empty run. All caches start empty.
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
851
852 fn whole_query_snapshot(query: &Query<'a>) -> Self {
853 let end = if query.is_empty() {
854 None
855 } else {
856 Some(query.tokens.len() - 1)
857 };
858
859 Self {
860 query: None,
861 whole_query_snapshot: Some(WholeQueryRunSnapshot {
862 index: query.index,
863 tokens: query.tokens.clone(),
864 line_by_pos: query.line_by_pos.clone(),
865 high_matchables: query.high_matchables.clone(),
866 low_matchables: query.low_matchables.clone(),
867 }),
868 start: 0,
869 end,
870 cached_high_matchables: OnceCell::new(),
871 cached_low_matchables: OnceCell::new(),
872 combined_matchables: RefCell::new(None),
873 }
874 }
875
876 fn source_tokens(&self) -> &[TokenId] {
877 if let Some(query) = self.query {
878 &query.tokens
879 } else {
880 &self
881 .whole_query_snapshot
882 .as_ref()
883 .expect("snapshot-backed whole query run should have snapshot data")
884 .tokens
885 }
886 }
887
888 fn source_line_by_pos(&self) -> &[usize] {
889 if let Some(query) = self.query {
890 &query.line_by_pos
891 } else {
892 &self
893 .whole_query_snapshot
894 .as_ref()
895 .expect("snapshot-backed whole query run should have snapshot data")
896 .line_by_pos
897 }
898 }
899
900 fn source_high_matchables(&self) -> &PositionSet {
901 if let Some(query) = self.query {
902 &query.high_matchables
903 } else {
904 &self
905 .whole_query_snapshot
906 .as_ref()
907 .expect("snapshot-backed whole query run should have snapshot data")
908 .high_matchables
909 }
910 }
911
912 fn source_low_matchables(&self) -> &PositionSet {
913 if let Some(query) = self.query {
914 &query.low_matchables
915 } else {
916 &self
917 .whole_query_snapshot
918 .as_ref()
919 .expect("snapshot-backed whole query run should have snapshot data")
920 .low_matchables
921 }
922 }
923
924 pub fn get_index(&self) -> &LicenseIndex {
926 if let Some(query) = self.query {
927 query.index
928 } else {
929 self.whole_query_snapshot
930 .as_ref()
931 .expect("snapshot-backed whole query run should have snapshot data")
932 .index
933 }
934 }
935
    /// Returns the line number recorded for position `pos` in the underlying
    /// source, or `None` when `pos` is out of range. The lookup is not
    /// restricted to this run's `start..=end`.
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.source_line_by_pos().get(pos).copied()
    }
946
947 pub fn tokens(&self) -> &[TokenId] {
953 match self.end {
954 Some(end) => &self.source_tokens()[self.start..=end],
955 None => &[],
956 }
957 }
958
959 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
963 self.tokens()
964 .iter()
965 .copied()
966 .enumerate()
967 .map(|(i, tid)| (self.start + i, tid))
968 }
969
970 pub fn is_digits_only(&self) -> bool {
974 self.tokens()
975 .iter()
976 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
977 }
978
979 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
989 if self.is_digits_only() {
990 return false;
991 }
992
993 let matchables = self.matchables(include_low);
994
995 if exclude_positions.is_empty() {
996 return !matchables.is_empty();
997 }
998
999 let mut matchable_set = matchables;
1000 for span in exclude_positions {
1001 matchable_set.remove_span(span);
1002 }
1003
1004 !matchable_set.is_empty()
1005 }
1006
1007 pub fn matchables(&self, include_low: bool) -> PositionSet {
1008 if include_low {
1009 if let Some(ref cached) = *self.combined_matchables.borrow() {
1010 return cached.clone();
1011 }
1012 let combined = self.low_matchables().union(&self.high_matchables());
1013 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1014 combined
1015 } else {
1016 self.high_matchables()
1017 }
1018 }
1019
1020 pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1021 let high_matchables = self.high_matchables();
1022 if high_matchables.is_empty() {
1023 return Vec::new();
1024 }
1025
1026 let matchables = self.matchables(true);
1027 self.tokens_with_pos()
1028 .map(|(pos, tid)| {
1029 if matchables.contains(pos) {
1030 Some(tid)
1031 } else {
1032 None
1033 }
1034 })
1035 .collect()
1036 }
1037
1038 pub fn high_matchables(&self) -> PositionSet {
1039 self.cached_high_matchables
1040 .get_or_init(|| {
1041 let start = self.start;
1042 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1043 let source = self.source_high_matchables();
1044 let live_span = PositionSpan::new(start, end);
1045 source
1046 .iter()
1047 .filter(|&pos| live_span.contains(pos))
1048 .collect()
1049 })
1050 .clone()
1051 }
1052
1053 pub fn low_matchables(&self) -> PositionSet {
1054 self.cached_low_matchables
1055 .get_or_init(|| {
1056 let start = self.start;
1057 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1058 let source = self.source_low_matchables();
1059 let live_span = PositionSpan::new(start, end);
1060 source
1061 .iter()
1062 .filter(|&pos| live_span.contains(pos))
1063 .collect()
1064 })
1065 .clone()
1066 }
1067}
1068
1069#[cfg(test)]
1070mod test;