1use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::tokenize::STOPWORDS;
6use crate::license_detection::tokenize::tokenize_as_ids;
7use bit_set::BitSet;
8use once_cell::sync::Lazy;
9use regex::Regex;
10use std::cell::{OnceCell, RefCell};
11use std::collections::{HashMap, HashSet};
12
/// Matches a single query word: one or more characters that are neither `_`
/// nor non-word characters, optionally followed by one `+` and further such
/// characters.
static QUERY_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into two kinds of captures: `token` (the same word shape as
/// `QUERY_PATTERN`) and `punct` (runs of `_`, non-word characters, whitespace
/// and `+`). Used to re-tokenize matched text while keeping the punctuation
/// between words.
static MATCHED_TEXT_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
19
/// One piece of matched text (a word or a punctuation run) produced by
/// `tokenize_matched_text` and consumed by the diagnostic renderer.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the token as it will be rendered.
    value: String,
    /// 1-based number of the line the token was found on.
    line_num: usize,
    /// Query position assigned to the token when its word is present in the
    /// index dictionary; `None` for punctuation and for words without a
    /// position.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation runs.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is one of the matched
    /// positions; always `false` right after tokenization.
    is_matched: bool,
}
28
/// An inclusive range of token positions (`start..=end`).
///
/// A span whose `start` is greater than its `end` contains nothing and
/// iterates over nothing.
#[derive(Debug, Clone)]
pub struct PositionSpan {
    start: usize,
    end: usize,
}

impl PositionSpan {
    /// Builds the span covering `start` through `end`, both included.
    pub fn new(start: usize, end: usize) -> Self {
        PositionSpan { start, end }
    }

    /// Returns `true` when `pos` lies within the span, bounds included.
    pub fn contains(&self, pos: usize) -> bool {
        (self.start..=self.end).contains(&pos)
    }

    /// Iterates over every position of the span in ascending order.
    pub fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        let (first, last) = (self.start, self.end);
        first..=last
    }
}
57
/// A tokenized piece of text to be matched against a license index.
///
/// Positions used throughout are 0-based indexes into `tokens`, i.e. they
/// count only tokens that are known to the index dictionary.
#[derive(Debug)]
pub struct Query<'a> {
    /// The text the query was built from, stored as given.
    pub text: String,

    /// Ids of the known tokens, in the order they appear in the text.
    pub tokens: Vec<TokenId>,

    /// For each position in `tokens`, the 1-based line number of that token.
    pub line_by_pos: Vec<usize>,

    /// Number of unknown tokens that follow a given known position.
    /// The key is the position of the last known token seen before them;
    /// the `None` key counts unknown tokens met before any known token.
    pub unknowns_by_pos: HashMap<Option<i32>, usize>,

    /// Number of stopwords that follow a given known position, keyed the
    /// same way as `unknowns_by_pos`.
    pub stopwords_by_pos: HashMap<Option<i32>, usize>,

    /// Positions whose known token is flagged `is_short_or_digit`.
    pub shorts_and_digits_pos: HashSet<usize>,

    /// Positions whose token kind is `TokenKind::Legalese`.
    pub high_matchables: BitSet,

    /// Positions whose token kind is `TokenKind::Regular`.
    pub low_matchables: BitSet,

    /// Whether the text is treated as coming from a binary source: either the
    /// value supplied by the caller or the result of `detect_binary`.
    pub is_binary: bool,

    /// `(start, end)` position ranges of the query runs, as computed by
    /// `compute_query_runs`; `end` is inclusive.
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// Lines recognised as SPDX (or NuGet license URL) identifier lines:
    /// the trimmed line text, the known position where the prefix starts,
    /// and the position just past the last known token of the line.
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// The license index whose dictionary was used to tokenize the text.
    pub index: &'a LicenseIndex,
}
150
/// Extracts the lines `start_line..=end_line` (1-based, inclusive) of `text`.
///
/// The selected lines are joined with `\n`, so original `\r\n` endings are
/// not preserved and no trailing newline is emitted. An empty string is
/// returned when either bound is `0` or when `start_line > end_line`; bounds
/// past the end of the text simply select fewer (possibly zero) lines.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line != 0 && end_line != 0 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // Cannot overflow: `start_line >= 1`, so the difference is at most
    // `usize::MAX - 1`.
    let wanted = end_line - start_line + 1;

    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();
    selected.join("\n")
}
169
170pub fn matched_text_diagnostics_from_text(
171 text: &str,
172 query: &Query<'_>,
173 matched_positions: &HashSet<usize>,
174 start_pos: usize,
175 end_pos: usize,
176 start_line: usize,
177 end_line: usize,
178) -> String {
179 let tokens = tokenize_matched_text(text, query);
180 let reportable_tokens = collect_reportable_tokens(
181 tokens,
182 matched_positions,
183 start_pos,
184 end_pos,
185 start_line,
186 end_line,
187 );
188 let line_endings = collect_line_endings(text);
189
190 render_diagnostic_tokens(&reportable_tokens, &line_endings)
191}
192
193fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
194 let mut tokens = Vec::new();
195 let mut pos = 0usize;
196 let mut line_num = 1usize;
197
198 for line in text.split_inclusive('\n') {
199 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
200 if let Some(token_match) = capture.name("token") {
201 let token_text = token_match.as_str();
202 let retokenized: Vec<String> = QUERY_PATTERN
203 .find_iter(&token_text.to_lowercase())
204 .map(|m| m.as_str().to_string())
205 .filter(|token| !STOPWORDS.contains(token.as_str()))
206 .collect();
207
208 if retokenized.is_empty() {
209 tokens.push(MatchedTextToken {
210 value: token_text.to_string(),
211 line_num,
212 pos: None,
213 is_text: true,
214 is_matched: false,
215 });
216 } else if retokenized.len() == 1 {
217 let token = &retokenized[0];
218 let token_pos = if query.index.dictionary.get(token).is_some() {
219 let current_pos = pos;
220 pos += 1;
221 Some(current_pos)
222 } else {
223 None
224 };
225
226 tokens.push(MatchedTextToken {
227 value: token_text.to_string(),
228 line_num,
229 pos: token_pos,
230 is_text: true,
231 is_matched: false,
232 });
233 } else {
234 for token in retokenized {
235 let token_pos = if query.index.dictionary.get(&token).is_some() {
236 let current_pos = pos;
237 pos += 1;
238 Some(current_pos)
239 } else {
240 None
241 };
242
243 tokens.push(MatchedTextToken {
244 value: token,
245 line_num,
246 pos: token_pos,
247 is_text: true,
248 is_matched: false,
249 });
250 }
251 }
252 } else if let Some(punct_match) = capture.name("punct") {
253 tokens.push(MatchedTextToken {
254 value: punct_match.as_str().to_string(),
255 line_num,
256 pos: None,
257 is_text: false,
258 is_matched: false,
259 });
260 }
261 }
262
263 line_num += 1;
264 }
265
266 tokens
267}
268
/// Filters `tokens` down to those that should appear in the diagnostic text
/// of a match.
///
/// Only tokens on lines `start_line..=end_line` are considered. A token is
/// kept when its position is in `matched_positions` (it is then also flagged
/// as matched), or when it lies between the token at `start_pos` and the
/// token at `end_pos` (both included, together with the punctuation and
/// unmatched words in between). A non-blank punctuation token directly
/// following the `end_pos` token is kept as well.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &HashSet<usize>,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started` is true while inside the start_pos..end_pos region.
    let mut started = false;
    let mut finished = false;
    // Index (in `tokens`) of the token that carried `end_pos`, until consumed.
    let mut end_real_pos = None;
    // Index (in `tokens`) of the previously visited token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Tokens arrive in line order, so nothing after this can qualify.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token
            .pos
            .is_some_and(|pos| matched_positions.contains(&pos))
        {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        if started && !finished {
            is_included = true;
        }

        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token immediately after the `end_pos` token:
        // allow one trailing, non-blank punctuation token.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
333
/// Returns the line terminator of every line of `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line that
/// has no terminator. A lone `\r` is not treated as a terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            Some(body) if body.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_owned());
    }

    endings
}
347
348fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
349 let mut rendered = String::new();
350 let mut previous_line: Option<usize> = None;
351
352 for token in tokens {
353 if let Some(prev_line) = previous_line
354 && token.line_num > prev_line
355 {
356 for line in prev_line..token.line_num {
357 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
358 rendered.push_str(line_ending.as_str());
359 }
360 }
361 }
362
363 let token_value = if token.is_text {
364 token.value.as_str()
365 } else {
366 token
367 .value
368 .strip_suffix("\r\n")
369 .or_else(|| token.value.strip_suffix('\n'))
370 .unwrap_or(token.value.as_str())
371 };
372
373 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
374 if token.is_matched {
375 rendered.push_str(token_value);
376 } else {
377 rendered.push('[');
378 rendered.push_str(token_value);
379 rendered.push(']');
380 }
381 } else {
382 rendered.push_str(token_value);
383 }
384
385 previous_line = Some(token.line_num);
386 }
387
388 rendered
389}
390
391impl<'a> Query<'a> {
    /// Line threshold handed to `compute_query_runs` for text that is not
    /// binary-derived: a query run is closed once this many consecutive
    /// lines without legalese tokens have been seen.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Same threshold, used for binary-derived text.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens kept per line when long lines are broken up
    /// by `break_long_lines`.
    const MAX_TOKEN_PER_LINE: usize = 25;
409
410 fn compute_spdx_offset(
411 tokens: &[QueryToken],
412 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
413 ) -> Option<usize> {
414 let get_known_id = |i: usize| -> Option<TokenId> {
415 match tokens.get(i)? {
416 QueryToken::Known(known) => Some(known.id),
417 _ => None,
418 }
419 };
420
421 let spdx_id = dictionary.get("spdx")?;
422 let license_id = dictionary.get("license")?;
423 let identifier_id = dictionary.get("identifier")?;
424 let licence_id = dictionary.get("licence");
425
426 let licenses_id = dictionary.get("licenses");
427 let nuget_id = dictionary.get("nuget");
428 let org_id = dictionary.get("org");
429
430 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
431 ids.iter().all(|id| id.is_some())
432 && ids[0] == Some(spdx_id)
433 && (ids[1] == Some(license_id) || ids[1] == licence_id)
434 && ids[2] == Some(identifier_id)
435 };
436
437 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
438 licenses_id.is_some()
439 && nuget_id.is_some()
440 && org_id.is_some()
441 && ids[0] == licenses_id
442 && ids[1] == Some(nuget_id.unwrap())
443 && ids[2] == Some(org_id.unwrap())
444 };
445
446 if tokens.len() >= 3 {
447 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
448 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
449 return Some(0);
450 }
451 }
452
453 if tokens.len() >= 4 {
454 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
455 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
456 return Some(1);
457 }
458 }
459
460 if tokens.len() >= 5 {
461 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
462 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
463 return Some(2);
464 }
465 }
466
467 None
468 }
469
470 pub fn from_extracted_text(
471 text: &str,
472 index: &'a LicenseIndex,
473 binary_derived: bool,
474 ) -> Result<Self, anyhow::Error> {
475 let line_threshold = if binary_derived {
476 Self::BINARY_LINE_THRESHOLD
477 } else {
478 Self::TEXT_LINE_THRESHOLD
479 };
480
481 Self::with_source_options(text, index, line_threshold, Some(binary_derived))
482 }
483
484 pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
488 self.query_run_ranges
489 .iter()
490 .map(|&(start, end)| QueryRun::new(self, start, end))
491 .collect()
492 }
493
    /// Tokenizes `text` against `index` and builds the full query.
    ///
    /// * `line_threshold` - forwarded to `compute_query_runs` to decide where
    ///   query runs are split.
    /// * `binary_derived` - when `Some`, used as the `is_binary` flag as is;
    ///   when `None`, the flag is computed with `detect_binary`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
    ) -> Result<Self, anyhow::Error> {
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = HashSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the last known token seen so far; -1 until the first one.
        let mut known_pos = -1i32;
        // Becomes true once the first known token has been seen.
        let mut started = false;
        // 1-based number of the line being processed.
        let mut current_line = 1usize;

        // Per line: known tokens as `Some`, unknown tokens as `None`
        // (stopwords are not recorded). Input of `compute_query_runs`.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for line in text.lines() {
            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos += 1;
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = Some(known_pos);
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos as usize);
                        }
                    }
                    // Unknown tokens before any known token are counted
                    // under the `None` key.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Later unknown tokens are attributed to the preceding
                    // known position.
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // Record the line when it starts with an SPDX/NuGet prefix and has
            // at least one known token. The stored end is one past the last
            // known position of the line.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let spdx_start_known_pos = line_first_known_pos + offset as i32;
                if spdx_start_known_pos <= line_last_known_pos {
                    let spdx_start = spdx_start_known_pos as usize;
                    let spdx_end = (line_last_known_pos + 1) as usize;
                    spdx_lines.push((line_trimmed.to_string(), spdx_start, spdx_end));
                }
            }

            tokens_by_line.push(line_tokens);
            current_line += 1;
        }

        // Positions of legalese tokens.
        let high_matchables: BitSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        // Positions of regular tokens.
        let low_matchables: BitSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
612
613 fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
627 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
628
629 if null_byte_count > 0 {
630 return Ok(true);
631 }
632
633 let non_printable_ratio = text
634 .chars()
635 .filter(|&c| {
636 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
637 })
638 .count() as f64
639 / text.len().max(1) as f64;
640
641 Ok(non_printable_ratio > 0.3)
642 }
643
644 fn detect_long_lines(text: &str) -> bool {
654 text.lines()
655 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
656 }
657
658 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
659 lines
660 .iter()
661 .flat_map(|line| {
662 if line.is_empty() {
663 return Vec::new();
664 }
665
666 if line.len() <= Self::MAX_TOKEN_PER_LINE {
667 vec![line.clone()]
668 } else {
669 line.chunks(Self::MAX_TOKEN_PER_LINE)
670 .map(|chunk| chunk.to_vec())
671 .collect()
672 }
673 })
674 .collect()
675 }
676
    /// Splits the token stream into query runs.
    ///
    /// `tokens_by_line` holds, per line, known tokens as `Some` and unknown
    /// tokens as `None`. A run is closed once it contains at least one known
    /// token and `line_threshold` or more "empty" lines have accumulated. A
    /// line counts as empty when it has no tokens, no known tokens, only
    /// digit-only/unknown tokens, or no legalese token; a line with a
    /// legalese token resets the counter. Runs made only of digit-only tokens
    /// are dropped.
    ///
    /// Returns `(start, Some(end))` pairs of inclusive positions, where
    /// positions count known tokens only.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // Position of the last known token of the current run, if any.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Position of the next known token.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Enough empty lines since the run got its last token: close it
            // and start a fresh one at the current position.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // A run without tokens yet starts wherever the next token lands.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) are treated as digit-only here.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Only known tokens advance `pos` and extend the run.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the run still open at the end of the text.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
753
    /// Returns the 1-based line number of the token at `pos`, or `None` when
    /// `pos` is past the last token.
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
767
    /// Returns `true` when the query contains no known token.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
773
    /// Returns a query run spanning all tokens of the query.
    ///
    /// The run is built from a snapshot: it holds clones of the query's
    /// tokens, line table and matchable sets rather than a reference to the
    /// query itself.
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
780
781 pub fn subtract(&mut self, span: &PositionSpan) {
790 for pos in span.iter() {
791 self.high_matchables.remove(pos);
792 self.low_matchables.remove(pos);
793 }
794 }
795
    /// Returns lines `start_line..=end_line` (1-based, inclusive) of the
    /// query text, as produced by `matched_text_from_text`.
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
812}
813
/// Owned copy of the parts of a `Query` that a whole-query `QueryRun` needs,
/// taken at the time the run is created.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The license index of the originating query.
    index: &'a LicenseIndex,
    /// Clone of the query's token ids.
    tokens: Vec<TokenId>,
    /// Clone of the query's position-to-line table.
    line_by_pos: Vec<usize>,
    /// Clone of the query's high matchable positions.
    high_matchables: BitSet,
    /// Clone of the query's low matchable positions.
    low_matchables: BitSet,
}
822
/// A contiguous slice of a query's tokens, from `start` to `end` inclusive.
///
/// A run is backed either by a borrowed `Query` (`query` is `Some`) or by an
/// owned snapshot of one (`whole_query_snapshot` is `Some`).
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// The borrowed query, for runs created with `QueryRun::new`.
    query: Option<&'a Query<'a>>,
    /// The owned snapshot, for runs created from a whole query.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// Position of the first token of the run.
    pub start: usize,
    /// Position of the last token of the run (inclusive); `None` makes
    /// `tokens()` return an empty slice.
    pub end: Option<usize>,
    /// High matchable positions within the run, computed on first use.
    cached_high_matchables: OnceCell<BitSet>,
    /// Low matchable positions within the run, computed on first use.
    cached_low_matchables: OnceCell<BitSet>,
    /// Union of high and low matchable positions, computed on first use.
    combined_matchables: RefCell<Option<BitSet>>,
}
840
841impl<'a> QueryRun<'a> {
    /// Creates a run over positions `start..=end` of `query`, borrowing the
    /// query. All matchable caches start out empty.
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
861
862 fn whole_query_snapshot(query: &Query<'a>) -> Self {
863 let end = if query.is_empty() {
864 None
865 } else {
866 Some(query.tokens.len() - 1)
867 };
868
869 Self {
870 query: None,
871 whole_query_snapshot: Some(WholeQueryRunSnapshot {
872 index: query.index,
873 tokens: query.tokens.clone(),
874 line_by_pos: query.line_by_pos.clone(),
875 high_matchables: query.high_matchables.clone(),
876 low_matchables: query.low_matchables.clone(),
877 }),
878 start: 0,
879 end,
880 cached_high_matchables: OnceCell::new(),
881 cached_low_matchables: OnceCell::new(),
882 combined_matchables: RefCell::new(None),
883 }
884 }
885
886 fn source_tokens(&self) -> &[TokenId] {
887 if let Some(query) = self.query {
888 &query.tokens
889 } else {
890 &self
891 .whole_query_snapshot
892 .as_ref()
893 .expect("snapshot-backed whole query run should have snapshot data")
894 .tokens
895 }
896 }
897
898 fn source_line_by_pos(&self) -> &[usize] {
899 if let Some(query) = self.query {
900 &query.line_by_pos
901 } else {
902 &self
903 .whole_query_snapshot
904 .as_ref()
905 .expect("snapshot-backed whole query run should have snapshot data")
906 .line_by_pos
907 }
908 }
909
910 fn source_high_matchables(&self) -> &BitSet {
911 if let Some(query) = self.query {
912 &query.high_matchables
913 } else {
914 &self
915 .whole_query_snapshot
916 .as_ref()
917 .expect("snapshot-backed whole query run should have snapshot data")
918 .high_matchables
919 }
920 }
921
922 fn source_low_matchables(&self) -> &BitSet {
923 if let Some(query) = self.query {
924 &query.low_matchables
925 } else {
926 &self
927 .whole_query_snapshot
928 .as_ref()
929 .expect("snapshot-backed whole query run should have snapshot data")
930 .low_matchables
931 }
932 }
933
934 pub fn get_index(&self) -> &LicenseIndex {
936 if let Some(query) = self.query {
937 query.index
938 } else {
939 self.whole_query_snapshot
940 .as_ref()
941 .expect("snapshot-backed whole query run should have snapshot data")
942 .index
943 }
944 }
945
    /// Returns the 1-based line number of the token at absolute position
    /// `pos` in the backing query, or `None` when `pos` is out of range. The
    /// lookup is not restricted to the run's own `start..=end` range.
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.source_line_by_pos().get(pos).copied()
    }
956
957 pub fn tokens(&self) -> &[TokenId] {
963 match self.end {
964 Some(end) => &self.source_tokens()[self.start..=end],
965 None => &[],
966 }
967 }
968
969 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
973 self.tokens()
974 .iter()
975 .copied()
976 .enumerate()
977 .map(|(i, tid)| (self.start + i, tid))
978 }
979
980 pub fn is_digits_only(&self) -> bool {
984 self.tokens()
985 .iter()
986 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
987 }
988
989 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
999 if self.is_digits_only() {
1000 return false;
1001 }
1002
1003 let matchables = self.matchables(include_low);
1004
1005 if exclude_positions.is_empty() {
1006 return !matchables.is_empty();
1007 }
1008
1009 let mut matchable_set = matchables;
1010 for span in exclude_positions {
1011 for pos in span.iter() {
1012 matchable_set.remove(pos);
1013 }
1014 }
1015
1016 !matchable_set.is_empty()
1017 }
1018
1019 pub fn matchables(&self, include_low: bool) -> BitSet {
1020 if include_low {
1021 if let Some(ref cached) = *self.combined_matchables.borrow() {
1022 return cached.clone();
1023 }
1024 let combined: BitSet = self
1025 .low_matchables()
1026 .union(&self.high_matchables())
1027 .collect();
1028 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1029 combined
1030 } else {
1031 self.high_matchables()
1032 }
1033 }
1034
1035 pub fn matchable_tokens(&self) -> Vec<i32> {
1036 let high_matchables = self.high_matchables();
1037 if high_matchables.is_empty() {
1038 return Vec::new();
1039 }
1040
1041 let matchables = self.matchables(true);
1042 self.tokens_with_pos()
1043 .map(|(pos, tid)| {
1044 if matchables.contains(pos) {
1045 tid.raw() as i32
1046 } else {
1047 -1
1048 }
1049 })
1050 .collect()
1051 }
1052
1053 pub fn high_matchables(&self) -> BitSet {
1054 self.cached_high_matchables
1055 .get_or_init(|| {
1056 let start = self.start;
1057 let end = self.end;
1058 let source = self.source_high_matchables();
1059 let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1060 source
1061 .iter()
1062 .filter(|&pos| live_span.contains(pos))
1063 .collect()
1064 })
1065 .clone()
1066 }
1067
1068 pub fn low_matchables(&self) -> BitSet {
1069 self.cached_low_matchables
1070 .get_or_init(|| {
1071 let start = self.start;
1072 let end = self.end;
1073 let source = self.source_low_matchables();
1074 let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1075 source
1076 .iter()
1077 .filter(|&pos| live_span.contains(pos))
1078 .collect()
1079 })
1080 .clone()
1081 }
1082}
1083
1084#[cfg(test)]
1085mod test;