1use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::spdx_lid::split_spdx_lid;
6use crate::license_detection::tokenize::STOPWORDS;
7use crate::license_detection::tokenize::tokenize_as_ids;
8use bit_set::BitSet;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use std::cell::{OnceCell, RefCell};
12use std::collections::{HashMap, HashSet};
13
/// Pattern used to re-tokenize matched text: a run of word characters other than `_`,
/// optionally followed by a single `+` and a further run of such characters.
static QUERY_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Pattern that splits a line into `token` and `punct` captures.
///
/// * `token` has the same shape as [`QUERY_PATTERN`].
/// * `punct` is one or more `_`, non-word, whitespace or `+` characters, followed by at
///   most one more `_`, non-word or whitespace character.
static MATCHED_TEXT_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
20
/// One piece of the original text produced by `tokenize_matched_text`.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the piece as stored by the tokenizer.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Query position assigned when the word is present in the index dictionary;
    /// `None` for punctuation and for words the dictionary does not know.
    pos: Option<usize>,
    /// `true` for `token` captures, `false` for `punct` captures.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is one of the matched positions.
    is_matched: bool,
}
29
/// An inclusive range of token positions, `start..=end`.
///
/// A span whose `start` is greater than its `end` contains nothing.
#[derive(Debug, Clone)]
pub struct PositionSpan {
    start: usize,
    end: usize,
}

impl PositionSpan {
    /// Builds the span covering `start..=end`.
    pub fn new(start: usize, end: usize) -> Self {
        PositionSpan { start, end }
    }

    /// Returns `true` when `pos` lies within the span, both bounds included.
    pub fn contains(&self, pos: usize) -> bool {
        (self.start..=self.end).contains(&pos)
    }

    /// Iterates over every position of the span in ascending order.
    pub fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        let (first, last) = (self.start, self.end);
        first..=last
    }
}
58
/// Tokenized view of a text, built against a [`LicenseIndex`].
#[derive(Debug)]
pub struct Query<'a> {
    /// Owned copy of the text the query was built from.
    pub text: String,

    /// Ids of the known tokens, in order of appearance; a token's index here is its position.
    pub tokens: Vec<TokenId>,

    /// For each position in `tokens`, the 1-based line the token came from.
    pub line_by_pos: Vec<usize>,

    /// Number of unknown tokens seen after a given known position.
    /// The `None` key counts unknown tokens that precede the first known token.
    pub unknowns_by_pos: HashMap<Option<i32>, usize>,

    /// Number of stopwords seen after a given known position.
    /// The `None` key counts stopwords that precede the first known token.
    pub stopwords_by_pos: HashMap<Option<i32>, usize>,

    /// Positions of known tokens flagged `is_short_or_digit`.
    pub shorts_and_digits_pos: HashSet<usize>,

    /// Positions whose token kind is `TokenKind::Legalese`.
    pub high_matchables: BitSet,

    /// Positions whose token kind is `TokenKind::Regular`.
    pub low_matchables: BitSet,

    /// Binary flag: either supplied by the caller or computed by `detect_binary`.
    pub is_binary: bool,

    /// `(start, end)` position ranges produced by `compute_query_runs`;
    /// `QueryRun` treats `end` as inclusive.
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// One `(text, start, end)` entry per line recognised as an SPDX identifier line:
    /// the text rebuilt from `split_spdx_lid`, the known position the marker starts at,
    /// and the line's last known position plus one.
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Index whose dictionary was used to tokenize the text.
    pub index: &'a LicenseIndex,
}
151
/// Returns lines `start_line..=end_line` (1-based, inclusive) of `text`, joined with `\n`.
///
/// Line terminators are those recognised by [`str::lines`], so `\r\n` endings are
/// normalised to `\n` in the result. An empty string is returned when either bound is
/// zero, when `start_line > end_line`, or when the range lies past the end of `text`.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line != 0 && end_line != 0 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // `start_line >= 1` here, so neither subtraction can underflow.
    let wanted = end_line - start_line + 1;
    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();
    selected.join("\n")
}
170
171pub fn matched_text_diagnostics_from_text(
172 text: &str,
173 query: &Query<'_>,
174 matched_positions: &HashSet<usize>,
175 start_pos: usize,
176 end_pos: usize,
177 start_line: usize,
178 end_line: usize,
179) -> String {
180 let tokens = tokenize_matched_text(text, query);
181 let reportable_tokens = collect_reportable_tokens(
182 tokens,
183 matched_positions,
184 start_pos,
185 end_pos,
186 start_line,
187 end_line,
188 );
189 let line_endings = collect_line_endings(text);
190
191 render_diagnostic_tokens(&reportable_tokens, &line_endings)
192}
193
194fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
195 let mut tokens = Vec::new();
196 let mut pos = 0usize;
197 let mut line_num = 1usize;
198
199 for line in text.split_inclusive('\n') {
200 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
201 if let Some(token_match) = capture.name("token") {
202 let token_text = token_match.as_str();
203 let retokenized: Vec<String> = QUERY_PATTERN
204 .find_iter(&token_text.to_lowercase())
205 .map(|m| m.as_str().to_string())
206 .filter(|token| !STOPWORDS.contains(token.as_str()))
207 .collect();
208
209 if retokenized.is_empty() {
210 tokens.push(MatchedTextToken {
211 value: token_text.to_string(),
212 line_num,
213 pos: None,
214 is_text: true,
215 is_matched: false,
216 });
217 } else if retokenized.len() == 1 {
218 let token = &retokenized[0];
219 let token_pos = if query.index.dictionary.get(token).is_some() {
220 let current_pos = pos;
221 pos += 1;
222 Some(current_pos)
223 } else {
224 None
225 };
226
227 tokens.push(MatchedTextToken {
228 value: token_text.to_string(),
229 line_num,
230 pos: token_pos,
231 is_text: true,
232 is_matched: false,
233 });
234 } else {
235 for token in retokenized {
236 let token_pos = if query.index.dictionary.get(&token).is_some() {
237 let current_pos = pos;
238 pos += 1;
239 Some(current_pos)
240 } else {
241 None
242 };
243
244 tokens.push(MatchedTextToken {
245 value: token,
246 line_num,
247 pos: token_pos,
248 is_text: true,
249 is_matched: false,
250 });
251 }
252 }
253 } else if let Some(punct_match) = capture.name("punct") {
254 tokens.push(MatchedTextToken {
255 value: punct_match.as_str().to_string(),
256 line_num,
257 pos: None,
258 is_text: false,
259 is_matched: false,
260 });
261 }
262 }
263
264 line_num += 1;
265 }
266
267 tokens
268}
269
/// Selects the tokens that belong in the diagnostic rendering of a match.
///
/// Only tokens on lines `start_line..=end_line` are considered. Within those lines a
/// token is kept when any of the following holds:
/// * its position is in `matched_positions` (it is also flagged `is_matched`);
/// * it lies between the token at `start_pos` and the token at `end_pos`, inclusive;
/// * it is the token immediately after the `end_pos` token and is non-blank punctuation.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &HashSet<usize>,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    let mut started = false;
    let mut finished = false;
    // Index (in `tokens`) of the token that carried `end_pos`, until it has been consumed.
    let mut end_real_pos = None;
    // Index (in `tokens`) of the previously examined in-range token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Line numbers only grow, so nothing after this point can be in range.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token
            .pos
            .is_some_and(|pos| matched_positions.contains(&pos))
        {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything between the start and end tokens is reported, matched or not.
        if started && !finished {
            is_included = true;
        }

        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token right after the end token: `last_real_pos` still
        // refers to the end token at that point. Trailing punctuation is kept then.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
334
/// Returns the terminator of every line of `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line without a
/// terminator. An empty `text` yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let ending = match line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
348
349fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
350 let mut rendered = String::new();
351 let mut previous_line: Option<usize> = None;
352
353 for token in tokens {
354 if let Some(prev_line) = previous_line
355 && token.line_num > prev_line
356 {
357 for line in prev_line..token.line_num {
358 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
359 rendered.push_str(line_ending.as_str());
360 }
361 }
362 }
363
364 let token_value = if token.is_text {
365 token.value.as_str()
366 } else {
367 token
368 .value
369 .strip_suffix("\r\n")
370 .or_else(|| token.value.strip_suffix('\n'))
371 .unwrap_or(token.value.as_str())
372 };
373
374 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
375 if token.is_matched {
376 rendered.push_str(token_value);
377 } else {
378 rendered.push('[');
379 rendered.push_str(token_value);
380 rendered.push(']');
381 }
382 } else {
383 rendered.push_str(token_value);
384 }
385
386 previous_line = Some(token.line_num);
387 }
388
389 rendered
390}
391
392impl<'a> Query<'a> {
/// Line threshold used for query-run splitting when the text is not binary-derived.
const TEXT_LINE_THRESHOLD: usize = 15;
/// Line threshold used for query-run splitting when the text is binary-derived.
const BINARY_LINE_THRESHOLD: usize = 50;
/// Maximum number of tokens per chunk when `break_long_lines` splits a line.
const MAX_TOKEN_PER_LINE: usize = 25;
410
411 fn compute_spdx_offset(
412 tokens: &[QueryToken],
413 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
414 ) -> Option<usize> {
415 let get_known_id = |i: usize| -> Option<TokenId> {
416 match tokens.get(i)? {
417 QueryToken::Known(known) => Some(known.id),
418 _ => None,
419 }
420 };
421
422 let spdx_id = dictionary.get("spdx")?;
423 let license_id = dictionary.get("license")?;
424 let identifier_id = dictionary.get("identifier")?;
425 let licence_id = dictionary.get("licence");
426
427 let licenses_id = dictionary.get("licenses");
428 let nuget_id = dictionary.get("nuget");
429 let org_id = dictionary.get("org");
430
431 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
432 ids.iter().all(|id| id.is_some())
433 && ids[0] == Some(spdx_id)
434 && (ids[1] == Some(license_id) || ids[1] == licence_id)
435 && ids[2] == Some(identifier_id)
436 };
437
438 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
439 licenses_id.is_some()
440 && nuget_id.is_some()
441 && org_id.is_some()
442 && ids[0] == licenses_id
443 && ids[1] == Some(nuget_id.unwrap())
444 && ids[2] == Some(org_id.unwrap())
445 };
446
447 if tokens.len() >= 3 {
448 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
449 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
450 return Some(0);
451 }
452 }
453
454 if tokens.len() >= 4 {
455 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
456 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
457 return Some(1);
458 }
459 }
460
461 if tokens.len() >= 5 {
462 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
463 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
464 return Some(2);
465 }
466 }
467
468 None
469 }
470
471 pub fn from_extracted_text(
472 text: &str,
473 index: &'a LicenseIndex,
474 binary_derived: bool,
475 ) -> Result<Self, anyhow::Error> {
476 let line_threshold = if binary_derived {
477 Self::BINARY_LINE_THRESHOLD
478 } else {
479 Self::TEXT_LINE_THRESHOLD
480 };
481
482 Self::with_source_options(text, index, line_threshold, Some(binary_derived))
483 }
484
485 pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
489 self.query_run_ranges
490 .iter()
491 .map(|&(start, end)| QueryRun::new(self, start, end))
492 .collect()
493 }
494
/// Tokenizes `text` line by line against `index.dictionary` and assembles the [`Query`].
///
/// * `line_threshold` is handed to `compute_query_runs` to decide where runs are split.
/// * `binary_derived`: `Some(flag)` is stored as `is_binary` unchanged; `None` makes the
///   function call `detect_binary` on `text`.
///
/// # Errors
/// Propagates an error from `detect_binary`.
fn with_source_options(
    text: &str,
    index: &'a LicenseIndex,
    line_threshold: usize,
    binary_derived: Option<bool>,
) -> Result<Self, anyhow::Error> {
    let is_binary = match binary_derived {
        Some(is_binary) => is_binary,
        None => Self::detect_binary(text)?,
    };
    let has_long_lines = Self::detect_long_lines(text);

    let mut tokens = Vec::new();
    let mut line_by_pos = Vec::new();
    let mut unknowns_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
    let mut stopwords_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
    let mut shorts_and_digits_pos = HashSet::new();
    let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

    // Position of the most recent known token; -1 until the first one is seen.
    let mut known_pos = -1i32;
    // Becomes true once any known token has been seen.
    let mut started = false;
    let mut current_line = 1usize;

    // Per line: `Some(token)` for known tokens, `None` for unknown ones (stopwords are
    // not recorded). Used afterwards to compute the query runs.
    let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

    for line in text.lines() {
        let line_trimmed = line.trim();
        let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

        let mut line_first_known_pos = None;

        let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

        for query_token in &line_query_tokens {
            match query_token {
                QueryToken::Known(known_token) => {
                    known_pos += 1;
                    started = true;
                    tokens.push(known_token.id);
                    line_by_pos.push(current_line);
                    line_tokens.push(Some(*known_token));

                    if line_first_known_pos.is_none() {
                        line_first_known_pos = Some(known_pos);
                    }

                    if known_token.is_short_or_digit {
                        let _ = shorts_and_digits_pos.insert(known_pos as usize);
                    }
                }
                // Unknown tokens and stopwords are counted against the preceding known
                // position, or against `None` while no known token has been seen yet.
                QueryToken::Unknown if !started => {
                    *unknowns_by_pos.entry(None).or_insert(0) += 1;
                    line_tokens.push(None);
                }
                QueryToken::Unknown => {
                    *unknowns_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                    line_tokens.push(None);
                }
                QueryToken::Stopword if !started => {
                    *stopwords_by_pos.entry(None).or_insert(0) += 1;
                }
                QueryToken::Stopword => {
                    *stopwords_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                }
            }
        }

        let line_last_known_pos = known_pos;

        let spdx_start_offset =
            Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

        // Record the line for SPDX handling when it carries an SPDX/NuGet marker and at
        // least one known token.
        if let Some(offset) = spdx_start_offset
            && let Some(line_first_known_pos) = line_first_known_pos
        {
            // The SPDX text is rebuilt from the untrimmed line.
            let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
            let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
            let spdx_start_known_pos = line_first_known_pos + offset as i32;

            if spdx_start_known_pos <= line_last_known_pos {
                let spdx_start = spdx_start_known_pos as usize;
                // Stored end is one past the line's last known position.
                let spdx_end = (line_last_known_pos + 1) as usize;
                spdx_lines.push((spdx_text, spdx_start, spdx_end));
            }
        }

        tokens_by_line.push(line_tokens);
        current_line += 1;
    }

    let high_matchables: BitSet = tokens
        .iter()
        .enumerate()
        .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
        .map(|(pos, _tid)| pos)
        .collect();

    let low_matchables: BitSet = tokens
        .iter()
        .enumerate()
        .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
        .map(|(pos, _tid)| pos)
        .collect();

    let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

    Ok(Query {
        text: text.to_string(),
        tokens,
        line_by_pos,
        unknowns_by_pos,
        stopwords_by_pos,
        shorts_and_digits_pos,
        high_matchables,
        low_matchables,
        is_binary,
        query_run_ranges: query_runs,
        spdx_lines,
        index,
    })
}
616
617 fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
631 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
632
633 if null_byte_count > 0 {
634 return Ok(true);
635 }
636
637 let non_printable_ratio = text
638 .chars()
639 .filter(|&c| {
640 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
641 })
642 .count() as f64
643 / text.len().max(1) as f64;
644
645 Ok(non_printable_ratio > 0.3)
646 }
647
648 fn detect_long_lines(text: &str) -> bool {
658 text.lines()
659 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
660 }
661
662 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
663 lines
664 .iter()
665 .flat_map(|line| {
666 if line.is_empty() {
667 return Vec::new();
668 }
669
670 if line.len() <= Self::MAX_TOKEN_PER_LINE {
671 vec![line.clone()]
672 } else {
673 line.chunks(Self::MAX_TOKEN_PER_LINE)
674 .map(|chunk| chunk.to_vec())
675 .collect()
676 }
677 })
678 .collect()
679 }
680
/// Groups known-token positions into query runs.
///
/// Lines are walked in order (after `break_long_lines` when `has_long_lines` is set).
/// A counter of "empty" lines grows for every line that has no tokens, has only
/// digit-only or unknown tokens, or has no `Legalese` token; it is reset by any line
/// that does contain a `Legalese` token. Once a run has started and the counter reaches
/// `line_threshold`, the run is closed before the next line is processed.
///
/// Runs made exclusively of digit-only known tokens are discarded. Each returned pair
/// is `(start, Some(end))` with `end` being the position of the run's last known token.
fn compute_query_runs(
    tokens_by_line: &[Vec<Option<KnownToken>>],
    line_threshold: usize,
    has_long_lines: bool,
) -> Vec<(usize, Option<usize>)> {
    let processed_lines = if has_long_lines {
        Self::break_long_lines(tokens_by_line)
    } else {
        tokens_by_line.to_vec()
    };

    let mut query_runs = Vec::new();
    let mut query_run_start = 0usize;
    // `None` while the current run has not seen a known token yet.
    let mut query_run_end = None;
    let mut empty_lines = 0usize;
    // Position of the next known token.
    let mut pos = 0usize;
    let mut query_run_is_all_digit = true;

    for line_tokens in processed_lines {
        // Close the current run once enough "empty" lines have accumulated.
        if query_run_end.is_some() && empty_lines >= line_threshold {
            if !query_run_is_all_digit {
                query_runs.push((query_run_start, query_run_end));
            }
            query_run_start = pos;
            query_run_end = None;
            empty_lines = 0;
            query_run_is_all_digit = true;
        }

        // A run that has no token yet starts at the next position to be assigned.
        if query_run_end.is_none() {
            query_run_start = pos;
        }

        if line_tokens.is_empty() {
            empty_lines += 1;
            continue;
        }

        // Unknown tokens (`None`) are treated as digit-only for this check.
        let line_is_all_digit = line_tokens
            .iter()
            .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
        let mut line_has_known_tokens = false;
        let mut line_has_good_tokens = false;

        // Only known tokens occupy positions; unknown ones are skipped by `flatten`.
        for known in line_tokens.into_iter().flatten() {
            line_has_known_tokens = true;
            if known.kind == TokenKind::Legalese {
                line_has_good_tokens = true;
            }
            if !known.is_digit_only {
                query_run_is_all_digit = false;
            }
            query_run_end = Some(pos);
            pos += 1;
        }

        if line_is_all_digit || !line_has_known_tokens {
            empty_lines += 1;
            continue;
        }

        if line_has_good_tokens {
            empty_lines = 0;
        } else {
            empty_lines += 1;
        }
    }

    // Flush the run still open at the end of the text.
    if let Some(end) = query_run_end
        && !query_run_is_all_digit
    {
        query_runs.push((query_run_start, Some(end)));
    }

    query_runs
}
757
/// Returns the 1-based line number of the token at `pos`, or `None` when `pos` is out
/// of range.
#[inline]
pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
    self.line_by_pos.get(pos).copied()
}
771
/// Returns `true` when the query holds no known tokens.
#[inline]
pub fn is_empty(&self) -> bool {
    self.tokens.is_empty()
}
777
/// Returns a run spanning every token of the query.
///
/// The run is backed by copies of the query's tokens, line map and matchable sets taken
/// at call time, so it does not borrow `self`.
pub fn whole_query_run(&self) -> QueryRun<'a> {
    QueryRun::whole_query_snapshot(self)
}
784
785 pub fn subtract(&mut self, span: &PositionSpan) {
794 for pos in span.iter() {
795 self.high_matchables.remove(pos);
796 self.low_matchables.remove(pos);
797 }
798 }
799
/// Returns lines `start_line..=end_line` (1-based, inclusive) of the query text by
/// delegating to `matched_text_from_text`.
pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
    matched_text_from_text(&self.text, start_line, end_line)
}
816}
817
/// Owned copy of the parts of a [`Query`] that a whole-query [`QueryRun`] reads.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// Index the query was built against.
    index: &'a LicenseIndex,
    /// Copy of the query's token ids.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line map.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchable positions at snapshot time.
    high_matchables: BitSet,
    /// Copy of the query's low matchable positions at snapshot time.
    low_matchables: BitSet,
}
826
/// A contiguous slice of a query's token positions.
///
/// A run reads its data either from a borrowed [`Query`] (`query`) or from an owned
/// snapshot (`whole_query_snapshot`); exactly one of the two is set by the constructors.
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Borrowed source query; `None` for snapshot-backed runs.
    query: Option<&'a Query<'a>>,
    /// Owned source data; `None` for query-backed runs.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// First position of the run.
    pub start: usize,
    /// Last position of the run (inclusive); `None` makes `tokens()` return nothing.
    pub end: Option<usize>,
    /// Lazily computed high matchables restricted to the run.
    cached_high_matchables: OnceCell<BitSet>,
    /// Lazily computed low matchables restricted to the run.
    cached_low_matchables: OnceCell<BitSet>,
    /// Lazily computed union of the run's low and high matchables.
    combined_matchables: RefCell<Option<BitSet>>,
}
844
845impl<'a> QueryRun<'a> {
/// Creates a run over positions `start..=end` of `query`, with empty caches.
///
/// With `end` set to `None`, `tokens()` returns an empty slice.
pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
    Self {
        query: Some(query),
        whole_query_snapshot: None,
        start,
        end,
        cached_high_matchables: OnceCell::new(),
        cached_low_matchables: OnceCell::new(),
        combined_matchables: RefCell::new(None),
    }
}
865
866 fn whole_query_snapshot(query: &Query<'a>) -> Self {
867 let end = if query.is_empty() {
868 None
869 } else {
870 Some(query.tokens.len() - 1)
871 };
872
873 Self {
874 query: None,
875 whole_query_snapshot: Some(WholeQueryRunSnapshot {
876 index: query.index,
877 tokens: query.tokens.clone(),
878 line_by_pos: query.line_by_pos.clone(),
879 high_matchables: query.high_matchables.clone(),
880 low_matchables: query.low_matchables.clone(),
881 }),
882 start: 0,
883 end,
884 cached_high_matchables: OnceCell::new(),
885 cached_low_matchables: OnceCell::new(),
886 combined_matchables: RefCell::new(None),
887 }
888 }
889
890 fn source_tokens(&self) -> &[TokenId] {
891 if let Some(query) = self.query {
892 &query.tokens
893 } else {
894 &self
895 .whole_query_snapshot
896 .as_ref()
897 .expect("snapshot-backed whole query run should have snapshot data")
898 .tokens
899 }
900 }
901
902 fn source_line_by_pos(&self) -> &[usize] {
903 if let Some(query) = self.query {
904 &query.line_by_pos
905 } else {
906 &self
907 .whole_query_snapshot
908 .as_ref()
909 .expect("snapshot-backed whole query run should have snapshot data")
910 .line_by_pos
911 }
912 }
913
914 fn source_high_matchables(&self) -> &BitSet {
915 if let Some(query) = self.query {
916 &query.high_matchables
917 } else {
918 &self
919 .whole_query_snapshot
920 .as_ref()
921 .expect("snapshot-backed whole query run should have snapshot data")
922 .high_matchables
923 }
924 }
925
926 fn source_low_matchables(&self) -> &BitSet {
927 if let Some(query) = self.query {
928 &query.low_matchables
929 } else {
930 &self
931 .whole_query_snapshot
932 .as_ref()
933 .expect("snapshot-backed whole query run should have snapshot data")
934 .low_matchables
935 }
936 }
937
938 pub fn get_index(&self) -> &LicenseIndex {
940 if let Some(query) = self.query {
941 query.index
942 } else {
943 self.whole_query_snapshot
944 .as_ref()
945 .expect("snapshot-backed whole query run should have snapshot data")
946 .index
947 }
948 }
949
/// Returns the line number recorded for absolute position `pos` in the run's source,
/// or `None` when `pos` is out of range. The lookup is not restricted to the run's own
/// `start..=end` bounds.
pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
    self.source_line_by_pos().get(pos).copied()
}
960
961 pub fn tokens(&self) -> &[TokenId] {
967 match self.end {
968 Some(end) => &self.source_tokens()[self.start..=end],
969 None => &[],
970 }
971 }
972
973 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
977 self.tokens()
978 .iter()
979 .copied()
980 .enumerate()
981 .map(|(i, tid)| (self.start + i, tid))
982 }
983
984 pub fn is_digits_only(&self) -> bool {
988 self.tokens()
989 .iter()
990 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
991 }
992
993 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
1003 if self.is_digits_only() {
1004 return false;
1005 }
1006
1007 let matchables = self.matchables(include_low);
1008
1009 if exclude_positions.is_empty() {
1010 return !matchables.is_empty();
1011 }
1012
1013 let mut matchable_set = matchables;
1014 for span in exclude_positions {
1015 for pos in span.iter() {
1016 matchable_set.remove(pos);
1017 }
1018 }
1019
1020 !matchable_set.is_empty()
1021 }
1022
1023 pub fn matchables(&self, include_low: bool) -> BitSet {
1024 if include_low {
1025 if let Some(ref cached) = *self.combined_matchables.borrow() {
1026 return cached.clone();
1027 }
1028 let combined: BitSet = self
1029 .low_matchables()
1030 .union(&self.high_matchables())
1031 .collect();
1032 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1033 combined
1034 } else {
1035 self.high_matchables()
1036 }
1037 }
1038
1039 pub fn matchable_tokens(&self) -> Vec<i32> {
1040 let high_matchables = self.high_matchables();
1041 if high_matchables.is_empty() {
1042 return Vec::new();
1043 }
1044
1045 let matchables = self.matchables(true);
1046 self.tokens_with_pos()
1047 .map(|(pos, tid)| {
1048 if matchables.contains(pos) {
1049 tid.raw() as i32
1050 } else {
1051 -1
1052 }
1053 })
1054 .collect()
1055 }
1056
1057 pub fn high_matchables(&self) -> BitSet {
1058 self.cached_high_matchables
1059 .get_or_init(|| {
1060 let start = self.start;
1061 let end = self.end;
1062 let source = self.source_high_matchables();
1063 let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1064 source
1065 .iter()
1066 .filter(|&pos| live_span.contains(pos))
1067 .collect()
1068 })
1069 .clone()
1070 }
1071
1072 pub fn low_matchables(&self) -> BitSet {
1073 self.cached_low_matchables
1074 .get_or_init(|| {
1075 let start = self.start;
1076 let end = self.end;
1077 let source = self.source_low_matchables();
1078 let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1079 source
1080 .iter()
1081 .filter(|&pos| live_span.contains(pos))
1082 .collect()
1083 })
1084 .clone()
1085 }
1086}
1087
1088#[cfg(test)]
1089mod test;