1use crate::license_detection::LicenseDetectionError;
7use crate::license_detection::index::LicenseIndex;
8use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
9use crate::license_detection::models::PositionSpan;
10use crate::license_detection::position_set::PositionSet;
11use crate::license_detection::spdx_lid::split_spdx_lid;
12use crate::license_detection::tokenize::STOPWORDS;
13use crate::license_detection::tokenize::tokenize_as_ids;
14use regex::Regex;
15use std::cell::{OnceCell, RefCell};
16use std::collections::HashMap;
17use std::sync::LazyLock;
18use std::time::Instant;
19
/// Matches one query token: a run of word characters other than `_`,
/// optionally followed by a single `+` and more such characters.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into `token` captures (same shape as the query token regex)
/// and `punct` captures (runs of underscores, non-word characters,
/// whitespace and `+`).
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
26
/// One piece of the original text as produced by `tokenize_matched_text`:
/// either a word token or a run of punctuation/whitespace.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the piece (original casing for single words, lowercased when a
    /// word was split into several sub-tokens).
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Position among dictionary-known tokens, or `None` for punctuation,
    /// stopwords and words missing from the dictionary.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is a matched position.
    is_matched: bool,
}
35
/// Tokenized view of one input text, bound to the [`LicenseIndex`] whose
/// dictionary was used to tokenize it.
#[derive(Debug)]
pub struct Query<'a> {
    /// Owned copy of the text the query was built from.
    pub text: String,

    /// Ids of the dictionary-known tokens, in text order. A "position" in the
    /// other fields is an index into this vector.
    pub tokens: Vec<TokenId>,

    /// 1-based line number of each position in `tokens`.
    pub line_by_pos: Vec<usize>,

    /// Number of unknown tokens seen after each known position and before the
    /// next one. The `None` key counts unknown tokens that come before the
    /// first known token.
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Number of stopwords seen after each known position, keyed the same
    /// way as `unknowns_by_pos`.
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Positions whose known token is flagged `is_short_or_digit`.
    pub shorts_and_digits_pos: PositionSet,

    /// Positions whose token kind is `TokenKind::Legalese`.
    pub high_matchables: PositionSet,

    /// Positions whose token kind is `TokenKind::Regular`.
    pub low_matchables: PositionSet,

    /// Whether the text is treated as binary-derived content.
    pub is_binary: bool,

    /// `(start, end)` position ranges of the query runs; each entry is turned
    /// into a [`QueryRun`] by `query_runs`.
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// Lines recognised as SPDX/NuGet license identifier lines, stored as
    /// `(text, start position, end position)` where the end is the line's
    /// last known position plus one.
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Index that supplied the token dictionary.
    pub index: &'a LicenseIndex,
}
127
/// Returns lines `start_line..=end_line` of `text` (1-based, inclusive),
/// joined with `\n`.
///
/// Line terminators are those recognised by [`str::lines`], so a trailing
/// `\r` is dropped from each returned line. An empty string is returned when
/// either bound is `0` or when `start_line > end_line`; bounds past the end
/// of `text` simply select fewer lines.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    if start_line == 0 || end_line == 0 || start_line > end_line {
        return String::new();
    }

    // The guard above gives `1 <= start_line <= end_line`, so neither
    // subtraction underflows and the `+ 1` cannot overflow.
    let wanted = end_line - start_line + 1;

    // `skip`/`take` stop scanning once the last wanted line has been seen,
    // instead of walking the whole text.
    text.lines()
        .skip(start_line - 1)
        .take(wanted)
        .collect::<Vec<_>>()
        .join("\n")
}
146
147pub fn matched_text_diagnostics_from_text(
148 text: &str,
149 query: &Query<'_>,
150 matched_positions: &PositionSet,
151 start_pos: usize,
152 end_pos: usize,
153 start_line: usize,
154 end_line: usize,
155) -> String {
156 let tokens = tokenize_matched_text(text, query);
157 let reportable_tokens = collect_reportable_tokens(
158 tokens,
159 matched_positions,
160 start_pos,
161 end_pos,
162 start_line,
163 end_line,
164 );
165 let line_endings = collect_line_endings(text);
166
167 render_diagnostic_tokens(&reportable_tokens, &line_endings)
168}
169
170pub fn matched_text_from_tokens(
180 text: &str,
181 query: &Query<'_>,
182 matched_positions: &PositionSet,
183 start_pos: usize,
184 end_pos: usize,
185 start_line: usize,
186 end_line: usize,
187) -> String {
188 let tokens = tokenize_matched_text(text, query);
189 let reportable_tokens = collect_reportable_tokens(
190 tokens,
191 matched_positions,
192 start_pos,
193 end_pos,
194 start_line,
195 end_line,
196 );
197 let line_endings = collect_line_endings(text);
198
199 render_plain_tokens(&reportable_tokens, &line_endings)
200}
201
202fn render_plain_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
203 let mut rendered = String::new();
204 let mut previous_line: Option<usize> = None;
205
206 for token in tokens {
207 if let Some(prev_line) = previous_line
208 && token.line_num > prev_line
209 {
210 for line in prev_line..token.line_num {
211 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
212 rendered.push_str(line_ending.as_str());
213 }
214 }
215 }
216
217 let token_value = if token.is_text {
218 token.value.as_str()
219 } else {
220 token
221 .value
222 .strip_suffix("\r\n")
223 .or_else(|| token.value.strip_suffix('\n'))
224 .unwrap_or(token.value.as_str())
225 };
226
227 rendered.push_str(token_value);
228
229 previous_line = Some(token.line_num);
230 }
231
232 rendered
233}
234
235fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
236 let mut tokens = Vec::new();
237 let mut pos = 0usize;
238 for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
239 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
240 if let Some(token_match) = capture.name("token") {
241 let token_text = token_match.as_str();
242 let retokenized: Vec<String> = QUERY_PATTERN
243 .find_iter(&token_text.to_lowercase())
244 .map(|m| m.as_str().to_string())
245 .filter(|token| !STOPWORDS.contains(token.as_str()))
246 .collect();
247
248 if retokenized.is_empty() {
249 tokens.push(MatchedTextToken {
250 value: token_text.to_string(),
251 line_num,
252 pos: None,
253 is_text: true,
254 is_matched: false,
255 });
256 } else if retokenized.len() == 1 {
257 let token = &retokenized[0];
258 let token_pos = if query.index.dictionary.get(token).is_some() {
259 let current_pos = pos;
260 pos += 1;
261 Some(current_pos)
262 } else {
263 None
264 };
265
266 tokens.push(MatchedTextToken {
267 value: token_text.to_string(),
268 line_num,
269 pos: token_pos,
270 is_text: true,
271 is_matched: false,
272 });
273 } else {
274 for token in retokenized {
275 let token_pos = if query.index.dictionary.get(&token).is_some() {
276 let current_pos = pos;
277 pos += 1;
278 Some(current_pos)
279 } else {
280 None
281 };
282
283 tokens.push(MatchedTextToken {
284 value: token,
285 line_num,
286 pos: token_pos,
287 is_text: true,
288 is_matched: false,
289 });
290 }
291 }
292 } else if let Some(punct_match) = capture.name("punct") {
293 tokens.push(MatchedTextToken {
294 value: punct_match.as_str().to_string(),
295 line_num,
296 pos: None,
297 is_text: false,
298 is_matched: false,
299 });
300 }
301 }
302 }
303
304 tokens
305}
306
/// Selects the tokens that belong in the rendered matched text.
///
/// Only tokens on lines `start_line..=end_line` are considered. A token is
/// kept when:
/// - its position is in `matched_positions` (it is also flagged `is_matched`),
/// - it lies between the token at `start_pos` and the token at `end_pos`,
///   both included, or
/// - it directly follows the end token and is non-blank punctuation.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    let mut started = false;
    let mut finished = false;
    // Index (in `tokens`) of the end token, until the token after it is seen.
    let mut end_real_pos = None;
    // Index of the previously processed (non-skipped) token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Tokens arrive in line order, so nothing after this can qualify.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Inside the region: keep everything, matched or not.
        if started && !finished {
            is_included = true;
        }

        // Checked after the inclusion test above so the end token itself is
        // still kept.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token immediately after the end token: keep it
        // when it is punctuation that is not pure whitespace.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
368
/// Returns the terminator of every line of `text`, in order: `"\r\n"`,
/// `"\n"`, or an empty string for a final line with no terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for piece in text.split_inclusive('\n') {
        let ending = match piece.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
382
383fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
384 let mut rendered = String::new();
385 let mut previous_line: Option<usize> = None;
386
387 for token in tokens {
388 if let Some(prev_line) = previous_line
389 && token.line_num > prev_line
390 {
391 for line in prev_line..token.line_num {
392 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
393 rendered.push_str(line_ending.as_str());
394 }
395 }
396 }
397
398 let token_value = if token.is_text {
399 token.value.as_str()
400 } else {
401 token
402 .value
403 .strip_suffix("\r\n")
404 .or_else(|| token.value.strip_suffix('\n'))
405 .unwrap_or(token.value.as_str())
406 };
407
408 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
409 if token.is_matched {
410 rendered.push_str(token_value);
411 } else {
412 rendered.push('[');
413 rendered.push_str(token_value);
414 rendered.push(']');
415 }
416 } else {
417 rendered.push_str(token_value);
418 }
419
420 previous_line = Some(token.line_num);
421 }
422
423 rendered
424}
425
426impl<'a> Query<'a> {
427 const TEXT_LINE_THRESHOLD: usize = 15;
442 const BINARY_LINE_THRESHOLD: usize = 50;
443 const MAX_TOKEN_PER_LINE: usize = 25;
444
445 fn compute_spdx_offset(
446 tokens: &[QueryToken],
447 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
448 ) -> Option<usize> {
449 let get_known_id = |i: usize| -> Option<TokenId> {
450 match tokens.get(i)? {
451 QueryToken::Known(known) => Some(known.id),
452 _ => None,
453 }
454 };
455
456 let spdx_id = dictionary.get("spdx")?;
457 let license_id = dictionary.get("license")?;
458 let identifier_id = dictionary.get("identifier")?;
459 let licence_id = dictionary.get("licence");
460
461 let licenses_id = dictionary.get("licenses");
462 let nuget_id = dictionary.get("nuget");
463 let org_id = dictionary.get("org");
464
465 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
466 ids.iter().all(|id| id.is_some())
467 && ids[0] == Some(spdx_id)
468 && (ids[1] == Some(license_id) || ids[1] == licence_id)
469 && ids[2] == Some(identifier_id)
470 };
471
472 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
473 licenses_id.is_some()
474 && nuget_id.is_some()
475 && org_id.is_some()
476 && ids[0] == licenses_id
477 && ids[1] == Some(nuget_id.unwrap())
478 && ids[2] == Some(org_id.unwrap())
479 };
480
481 if tokens.len() >= 3 {
482 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
483 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
484 return Some(0);
485 }
486 }
487
488 if tokens.len() >= 4 {
489 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
490 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
491 return Some(1);
492 }
493 }
494
495 if tokens.len() >= 5 {
496 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
497 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
498 return Some(2);
499 }
500 }
501
502 None
503 }
504
    /// Builds a query from already-extracted `text` with no deadline.
    ///
    /// Equivalent to calling `from_extracted_text_with_deadline` with
    /// `deadline = None`; `binary_derived` is forwarded unchanged.
    pub(crate) fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, LicenseDetectionError> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
512
513 pub(crate) fn from_extracted_text_with_deadline(
514 text: &str,
515 index: &'a LicenseIndex,
516 binary_derived: bool,
517 deadline: Option<Instant>,
518 ) -> Result<Self, LicenseDetectionError> {
519 let line_threshold = if binary_derived {
520 Self::BINARY_LINE_THRESHOLD
521 } else {
522 Self::TEXT_LINE_THRESHOLD
523 };
524
525 Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
526 }
527
528 pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
532 self.query_run_ranges
533 .iter()
534 .map(|&(start, end)| QueryRun::new(self, start, end))
535 .collect()
536 }
537
    /// Tokenizes `text` against `index` and assembles the full [`Query`].
    ///
    /// - `line_threshold` is forwarded to `compute_query_runs`.
    /// - `binary_derived` fixes the `is_binary` flag when `Some`; when `None`
    ///   the flag comes from `detect_binary`.
    /// - `deadline` is checked on entry, every 128 lines, and once after
    ///   tokenization; an error from `ensure_within_deadline` is propagated.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, LicenseDetectionError> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text),
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one.
        let mut known_pos: Option<usize> = None;
        // Becomes true once any known token has been seen.
        let mut started = false;
        // Per line: `Some` for known tokens, `None` for unknown ones
        // (stopwords are not recorded here).
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for (current_line, (line_index, line)) in (1usize..).zip(text.lines().enumerate()) {
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            // `known_pos` was set to `Some` just above.
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknown tokens and stopwords before the first known
                    // token are counted under the `None` key.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset;

                // A known token on this line implies `known_pos` is `Some`,
                // so the unwraps below cannot fail.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }
            tokens_by_line.push(line_tokens);
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
662
663 fn detect_binary(text: &str) -> bool {
677 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
678
679 if null_byte_count > 0 {
680 return true;
681 }
682
683 let non_printable_ratio = text
684 .chars()
685 .filter(|&c| {
686 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
687 })
688 .count() as f64
689 / text.len().max(1) as f64;
690
691 non_printable_ratio > 0.3
692 }
693
694 fn detect_long_lines(text: &str) -> bool {
704 text.lines()
705 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
706 }
707
708 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
709 lines
710 .iter()
711 .flat_map(|line| {
712 if line.is_empty() {
713 return Vec::new();
714 }
715
716 if line.len() <= Self::MAX_TOKEN_PER_LINE {
717 vec![line.clone()]
718 } else {
719 line.chunks(Self::MAX_TOKEN_PER_LINE)
720 .map(|chunk| chunk.to_vec())
721 .collect()
722 }
723 })
724 .collect()
725 }
726
    /// Splits the known-token positions into query runs.
    ///
    /// Lines are scanned in order (after `break_long_lines` when
    /// `has_long_lines` is set). A run is closed once `line_threshold` or more
    /// "empty" lines have piled up since the last line containing a legalese
    /// token. Lines count as empty when they have no tokens, no known tokens,
    /// only digit tokens, or no legalese token. Runs made only of digit-only
    /// tokens are discarded.
    ///
    /// Returns `(start, Some(end))` pairs of inclusive positions.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run has no token yet.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Next known-token position to hand out.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Break point reached: close the current run and start a new one.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // An empty run keeps sliding its start to the current position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) do not stop a line from being all-digit.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Only known tokens consume a position.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with a legalese token resets the empty-line count.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the last run, unless it is empty or digits only.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
803
    /// Returns the 1-based line number recorded for token position `pos`, or
    /// `None` when `pos` is out of range.
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
817
    /// Returns `true` when the query holds no known tokens.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
823
    /// Returns a [`QueryRun`] spanning every token of this query.
    ///
    /// The run is backed by a snapshot (clones of the tokens, line numbers
    /// and matchable sets), so it does not borrow `self`.
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
830
    /// Removes the positions covered by `span` from both the high and the low
    /// matchable sets.
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
843
    /// Returns lines `start_line..=end_line` (1-based, inclusive) of the query
    /// text, as produced by `matched_text_from_text`.
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
860}
861
/// Owned copy of the parts of a [`Query`] that a whole-query [`QueryRun`]
/// reads, so that the run does not have to borrow the query.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// Index the query was built against.
    index: &'a LicenseIndex,
    /// Clone of `Query::tokens`.
    tokens: Vec<TokenId>,
    /// Clone of `Query::line_by_pos`.
    line_by_pos: Vec<usize>,
    /// Clone of `Query::high_matchables` taken when the snapshot was made.
    high_matchables: PositionSet,
    /// Clone of `Query::low_matchables` taken when the snapshot was made.
    low_matchables: PositionSet,
}
870
/// A contiguous slice of a query's token positions.
///
/// Exactly one of `query` (borrowed backing data) or `whole_query_snapshot`
/// (owned backing data) is set by the constructors.
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Borrowed backing query; `None` for snapshot-backed runs.
    query: Option<&'a Query<'a>>,
    /// Owned backing data; `None` for query-backed runs.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// First position of the run.
    pub start: usize,
    /// Last position of the run (inclusive); `None` for an empty run.
    pub end: Option<usize>,
    /// Lazily computed high matchables restricted to this run.
    cached_high_matchables: OnceCell<PositionSet>,
    /// Lazily computed low matchables restricted to this run.
    cached_low_matchables: OnceCell<PositionSet>,
    /// Lazily computed union of the low and high matchables of this run.
    combined_matchables: RefCell<Option<PositionSet>>,
}
888
889impl<'a> QueryRun<'a> {
    /// Creates a run over positions `start..=end` of `query`, borrowing the
    /// query as backing data. An `end` of `None` denotes an empty run. All
    /// caches start out empty.
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
909
910 fn whole_query_snapshot(query: &Query<'a>) -> Self {
911 let end = if query.is_empty() {
912 None
913 } else {
914 Some(query.tokens.len() - 1)
915 };
916
917 Self {
918 query: None,
919 whole_query_snapshot: Some(WholeQueryRunSnapshot {
920 index: query.index,
921 tokens: query.tokens.clone(),
922 line_by_pos: query.line_by_pos.clone(),
923 high_matchables: query.high_matchables.clone(),
924 low_matchables: query.low_matchables.clone(),
925 }),
926 start: 0,
927 end,
928 cached_high_matchables: OnceCell::new(),
929 cached_low_matchables: OnceCell::new(),
930 combined_matchables: RefCell::new(None),
931 }
932 }
933
934 fn source_tokens(&self) -> &[TokenId] {
935 if let Some(query) = self.query {
936 &query.tokens
937 } else {
938 &self
939 .whole_query_snapshot
940 .as_ref()
941 .expect("snapshot-backed whole query run should have snapshot data")
942 .tokens
943 }
944 }
945
946 fn source_line_by_pos(&self) -> &[usize] {
947 if let Some(query) = self.query {
948 &query.line_by_pos
949 } else {
950 &self
951 .whole_query_snapshot
952 .as_ref()
953 .expect("snapshot-backed whole query run should have snapshot data")
954 .line_by_pos
955 }
956 }
957
958 fn source_high_matchables(&self) -> &PositionSet {
959 if let Some(query) = self.query {
960 &query.high_matchables
961 } else {
962 &self
963 .whole_query_snapshot
964 .as_ref()
965 .expect("snapshot-backed whole query run should have snapshot data")
966 .high_matchables
967 }
968 }
969
970 fn source_low_matchables(&self) -> &PositionSet {
971 if let Some(query) = self.query {
972 &query.low_matchables
973 } else {
974 &self
975 .whole_query_snapshot
976 .as_ref()
977 .expect("snapshot-backed whole query run should have snapshot data")
978 .low_matchables
979 }
980 }
981
982 pub fn get_index(&self) -> &LicenseIndex {
984 if let Some(query) = self.query {
985 query.index
986 } else {
987 self.whole_query_snapshot
988 .as_ref()
989 .expect("snapshot-backed whole query run should have snapshot data")
990 .index
991 }
992 }
993
    /// Returns the 1-based line number recorded for position `pos` in the
    /// backing data, or `None` when `pos` is out of range. `pos` is an
    /// absolute position, not an offset into this run.
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.source_line_by_pos().get(pos).copied()
    }
1004
1005 pub fn tokens(&self) -> &[TokenId] {
1011 match self.end {
1012 Some(end) => &self.source_tokens()[self.start..=end],
1013 None => &[],
1014 }
1015 }
1016
1017 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
1021 self.tokens()
1022 .iter()
1023 .copied()
1024 .enumerate()
1025 .map(|(i, tid)| (self.start + i, tid))
1026 }
1027
1028 pub fn is_digits_only(&self) -> bool {
1032 self.tokens()
1033 .iter()
1034 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
1035 }
1036
1037 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
1047 if self.is_digits_only() {
1048 return false;
1049 }
1050
1051 let matchables = self.matchables(include_low);
1052
1053 if exclude_positions.is_empty() {
1054 return !matchables.is_empty();
1055 }
1056
1057 let mut matchable_set = matchables;
1058 for span in exclude_positions {
1059 matchable_set.remove_span(span);
1060 }
1061
1062 !matchable_set.is_empty()
1063 }
1064
1065 pub fn matchables(&self, include_low: bool) -> PositionSet {
1066 if include_low {
1067 if let Some(ref cached) = *self.combined_matchables.borrow() {
1068 return cached.clone();
1069 }
1070 let combined = self.low_matchables().union(&self.high_matchables());
1071 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1072 combined
1073 } else {
1074 self.high_matchables()
1075 }
1076 }
1077
1078 pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1079 let high_matchables = self.high_matchables();
1080 if high_matchables.is_empty() {
1081 return Vec::new();
1082 }
1083
1084 let matchables = self.matchables(true);
1085 self.tokens_with_pos()
1086 .map(|(pos, tid)| {
1087 if matchables.contains(pos) {
1088 Some(tid)
1089 } else {
1090 None
1091 }
1092 })
1093 .collect()
1094 }
1095
1096 pub fn high_matchables(&self) -> PositionSet {
1097 self.cached_high_matchables
1098 .get_or_init(|| {
1099 let start = self.start;
1100 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1101 let source = self.source_high_matchables();
1102 let live_span = PositionSpan::new(start, end);
1103 source
1104 .iter()
1105 .filter(|&pos| live_span.contains(pos))
1106 .collect()
1107 })
1108 .clone()
1109 }
1110
1111 pub fn low_matchables(&self) -> PositionSet {
1112 self.cached_low_matchables
1113 .get_or_init(|| {
1114 let start = self.start;
1115 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1116 let source = self.source_low_matchables();
1117 let live_span = PositionSpan::new(start, end);
1118 source
1119 .iter()
1120 .filter(|&pos| live_span.contains(pos))
1121 .collect()
1122 })
1123 .clone()
1124 }
1125}
1126
1127#[cfg(test)]
1128mod test;