1use crate::license_detection::index::LicenseIndex;
7use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
8use crate::license_detection::models::PositionSpan;
9use crate::license_detection::position_set::PositionSet;
10use crate::license_detection::spdx_lid::split_spdx_lid;
11use crate::license_detection::tokenize::STOPWORDS;
12use crate::license_detection::tokenize::tokenize_as_ids;
13use regex::Regex;
14use std::cell::{OnceCell, RefCell};
15use std::collections::HashMap;
16use std::sync::LazyLock;
17use std::time::Instant;
18
/// Lazily compiled regex for a single query word: one or more word characters
/// other than `_`, optionally followed by a `+` and more such characters.
///
/// `tokenize_matched_text` uses it to re-split the lowercased text of each
/// `token` capture.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Lazily compiled regex that splits text into two kinds of captures:
///
/// - `token`: a word with the same shape as [`QUERY_PATTERN`];
/// - `punct`: a run of `_`, non-word, whitespace or `+` characters, optionally
///   followed by one more `_`, non-word or whitespace character.
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
25
/// One piece of the original text produced by `tokenize_matched_text`: either
/// a word (`is_text == true`) or a punctuation/whitespace run.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece. Words keep their original spelling unless one
    /// capture was re-split into several words, in which case each part is the
    /// lowercased sub-word.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Position among the words found in the index dictionary, or `None` for
    /// punctuation, stopwords and words the dictionary does not know.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is in the matched set.
    is_matched: bool,
}
34
/// A tokenized view of an input text, built against a [`LicenseIndex`].
///
/// Positions ("pos") used throughout are 0-based indexes into `tokens`, i.e.
/// they count only tokens known to the index dictionary.
#[derive(Debug)]
pub struct Query<'a> {
    /// Owned copy of the text the query was built from.
    pub text: String,

    /// IDs of the known tokens, in the order they appear in the text.
    pub tokens: Vec<TokenId>,

    /// For each position in `tokens`, the 1-based line number it came from.
    pub line_by_pos: Vec<usize>,

    /// Number of unknown tokens seen after each known position. The `None`
    /// key counts unknown tokens seen before the first known token.
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Number of stopwords seen after each known position. The `None` key
    /// counts stopwords seen before the first known token.
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Positions of known tokens flagged `is_short_or_digit`.
    pub shorts_and_digits_pos: PositionSet,

    /// Positions whose token kind is `TokenKind::Legalese`.
    pub high_matchables: PositionSet,

    /// Positions whose token kind is `TokenKind::Regular`.
    pub low_matchables: PositionSet,

    /// Whether the text is treated as binary-derived: either the caller's
    /// flag or the result of `detect_binary`.
    pub is_binary: bool,

    /// `(start, end)` position ranges computed by `compute_query_runs`;
    /// `query_runs` turns each into a [`QueryRun`].
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// One entry per detected SPDX-identifier-like line: the text rebuilt from
    /// `split_spdx_lid`, the start position, and the last known position on
    /// the line plus one.
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// The license index whose dictionary was used to tokenize the text.
    pub index: &'a LicenseIndex,
}
126
/// Returns lines `start_line..=end_line` of `text` (1-based, inclusive),
/// joined with `\n`.
///
/// Line terminators are dropped by `str::lines`, so `\r\n` endings come back
/// as `\n`. An empty string is returned when either bound is `0` or when
/// `start_line > end_line`; bounds past the end of the text are clamped.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let bounds_are_valid = start_line != 0 && end_line != 0 && start_line <= end_line;
    if !bounds_are_valid {
        return String::new();
    }

    // `start_line >= 1` and `end_line >= start_line`, so neither expression
    // below can underflow or overflow.
    let selected: Vec<&str> = text
        .lines()
        .skip(start_line - 1)
        .take(end_line - start_line + 1)
        .collect();

    selected.join("\n")
}
145
146pub fn matched_text_diagnostics_from_text(
147 text: &str,
148 query: &Query<'_>,
149 matched_positions: &PositionSet,
150 start_pos: usize,
151 end_pos: usize,
152 start_line: usize,
153 end_line: usize,
154) -> String {
155 let tokens = tokenize_matched_text(text, query);
156 let reportable_tokens = collect_reportable_tokens(
157 tokens,
158 matched_positions,
159 start_pos,
160 end_pos,
161 start_line,
162 end_line,
163 );
164 let line_endings = collect_line_endings(text);
165
166 render_diagnostic_tokens(&reportable_tokens, &line_endings)
167}
168
169pub fn matched_text_from_tokens(
179 text: &str,
180 query: &Query<'_>,
181 matched_positions: &PositionSet,
182 start_pos: usize,
183 end_pos: usize,
184 start_line: usize,
185 end_line: usize,
186) -> String {
187 let tokens = tokenize_matched_text(text, query);
188 let reportable_tokens = collect_reportable_tokens(
189 tokens,
190 matched_positions,
191 start_pos,
192 end_pos,
193 start_line,
194 end_line,
195 );
196 let line_endings = collect_line_endings(text);
197
198 render_plain_tokens(&reportable_tokens, &line_endings)
199}
200
201fn render_plain_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
202 let mut rendered = String::new();
203 let mut previous_line: Option<usize> = None;
204
205 for token in tokens {
206 if let Some(prev_line) = previous_line
207 && token.line_num > prev_line
208 {
209 for line in prev_line..token.line_num {
210 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
211 rendered.push_str(line_ending.as_str());
212 }
213 }
214 }
215
216 let token_value = if token.is_text {
217 token.value.as_str()
218 } else {
219 token
220 .value
221 .strip_suffix("\r\n")
222 .or_else(|| token.value.strip_suffix('\n'))
223 .unwrap_or(token.value.as_str())
224 };
225
226 rendered.push_str(token_value);
227
228 previous_line = Some(token.line_num);
229 }
230
231 rendered
232}
233
234fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
235 let mut tokens = Vec::new();
236 let mut pos = 0usize;
237 for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
238 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
239 if let Some(token_match) = capture.name("token") {
240 let token_text = token_match.as_str();
241 let retokenized: Vec<String> = QUERY_PATTERN
242 .find_iter(&token_text.to_lowercase())
243 .map(|m| m.as_str().to_string())
244 .filter(|token| !STOPWORDS.contains(token.as_str()))
245 .collect();
246
247 if retokenized.is_empty() {
248 tokens.push(MatchedTextToken {
249 value: token_text.to_string(),
250 line_num,
251 pos: None,
252 is_text: true,
253 is_matched: false,
254 });
255 } else if retokenized.len() == 1 {
256 let token = &retokenized[0];
257 let token_pos = if query.index.dictionary.get(token).is_some() {
258 let current_pos = pos;
259 pos += 1;
260 Some(current_pos)
261 } else {
262 None
263 };
264
265 tokens.push(MatchedTextToken {
266 value: token_text.to_string(),
267 line_num,
268 pos: token_pos,
269 is_text: true,
270 is_matched: false,
271 });
272 } else {
273 for token in retokenized {
274 let token_pos = if query.index.dictionary.get(&token).is_some() {
275 let current_pos = pos;
276 pos += 1;
277 Some(current_pos)
278 } else {
279 None
280 };
281
282 tokens.push(MatchedTextToken {
283 value: token,
284 line_num,
285 pos: token_pos,
286 is_text: true,
287 is_matched: false,
288 });
289 }
290 }
291 } else if let Some(punct_match) = capture.name("punct") {
292 tokens.push(MatchedTextToken {
293 value: punct_match.as_str().to_string(),
294 line_num,
295 pos: None,
296 is_text: false,
297 is_matched: false,
298 });
299 }
300 }
301 }
302
303 tokens
304}
305
/// Filters `tokens` down to those that should appear in a rendered match.
///
/// Only tokens on lines `start_line..=end_line` are considered. Within that
/// window a token is kept when any of the following holds:
///
/// - its position is in `matched_positions` (it is also flagged `is_matched`);
/// - it lies between the token at `start_pos` and the token at `end_pos`,
///   inclusive, punctuation included;
/// - it is the token immediately after the `end_pos` token, is punctuation,
///   and is not whitespace-only.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`: the start token has been seen and the end token has not.
    let mut started = false;
    // `finished`: the end token has been seen.
    let mut finished = false;
    // Stream index (punctuation included) of the end token, until consumed.
    let mut end_real_pos = None;
    // Stream index of the previously processed in-window token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Tokens are in line order, so nothing after this can qualify.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        if started && !finished {
            is_included = true;
        }

        // Checked after the inclusion test above so the end token itself is
        // still included.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token right after the end token: keep one
        // trailing punctuation token unless it is pure whitespace.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
367
/// Returns the terminator of every line of `text`, in order: `"\r\n"`,
/// `"\n"`, or `""` for a final line without a newline.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let ending = match line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
381
382fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
383 let mut rendered = String::new();
384 let mut previous_line: Option<usize> = None;
385
386 for token in tokens {
387 if let Some(prev_line) = previous_line
388 && token.line_num > prev_line
389 {
390 for line in prev_line..token.line_num {
391 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
392 rendered.push_str(line_ending.as_str());
393 }
394 }
395 }
396
397 let token_value = if token.is_text {
398 token.value.as_str()
399 } else {
400 token
401 .value
402 .strip_suffix("\r\n")
403 .or_else(|| token.value.strip_suffix('\n'))
404 .unwrap_or(token.value.as_str())
405 };
406
407 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
408 if token.is_matched {
409 rendered.push_str(token_value);
410 } else {
411 rendered.push('[');
412 rendered.push_str(token_value);
413 rendered.push(']');
414 }
415 } else {
416 rendered.push_str(token_value);
417 }
418
419 previous_line = Some(token.line_num);
420 }
421
422 rendered
423}
424
425impl<'a> Query<'a> {
    /// Line threshold handed to `compute_query_runs` when the text is not
    /// binary-derived.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold handed to `compute_query_runs` when the text is
    /// binary-derived.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Chunk size used by `break_long_lines` when splitting long token lines.
    const MAX_TOKEN_PER_LINE: usize = 25;
443
444 fn compute_spdx_offset(
445 tokens: &[QueryToken],
446 dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
447 ) -> Option<usize> {
448 let get_known_id = |i: usize| -> Option<TokenId> {
449 match tokens.get(i)? {
450 QueryToken::Known(known) => Some(known.id),
451 _ => None,
452 }
453 };
454
455 let spdx_id = dictionary.get("spdx")?;
456 let license_id = dictionary.get("license")?;
457 let identifier_id = dictionary.get("identifier")?;
458 let licence_id = dictionary.get("licence");
459
460 let licenses_id = dictionary.get("licenses");
461 let nuget_id = dictionary.get("nuget");
462 let org_id = dictionary.get("org");
463
464 let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
465 ids.iter().all(|id| id.is_some())
466 && ids[0] == Some(spdx_id)
467 && (ids[1] == Some(license_id) || ids[1] == licence_id)
468 && ids[2] == Some(identifier_id)
469 };
470
471 let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
472 licenses_id.is_some()
473 && nuget_id.is_some()
474 && org_id.is_some()
475 && ids[0] == licenses_id
476 && ids[1] == Some(nuget_id.unwrap())
477 && ids[2] == Some(org_id.unwrap())
478 };
479
480 if tokens.len() >= 3 {
481 let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
482 if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
483 return Some(0);
484 }
485 }
486
487 if tokens.len() >= 4 {
488 let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
489 if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
490 return Some(1);
491 }
492 }
493
494 if tokens.len() >= 5 {
495 let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
496 if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
497 return Some(2);
498 }
499 }
500
501 None
502 }
503
    /// Builds a query from already extracted `text` without a deadline.
    ///
    /// Shorthand for [`Self::from_extracted_text_with_deadline`] with
    /// `deadline = None`.
    ///
    /// # Errors
    ///
    /// Propagates any error from the underlying query construction.
    pub fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, anyhow::Error> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
511
    /// Builds a query from already extracted `text`, optionally bounded by a
    /// `deadline`.
    ///
    /// `binary_derived` picks the line threshold for query-run splitting
    /// (`BINARY_LINE_THRESHOLD` when `true`, `TEXT_LINE_THRESHOLD` otherwise)
    /// and is stored as the query's `is_binary` flag, so no binary detection
    /// is run on the text.
    ///
    /// # Errors
    ///
    /// Propagates any error from `with_source_options`, including its
    /// deadline checks.
    pub fn from_extracted_text_with_deadline(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        let line_threshold = if binary_derived {
            Self::BINARY_LINE_THRESHOLD
        } else {
            Self::TEXT_LINE_THRESHOLD
        };

        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
    }
526
    /// Returns one [`QueryRun`] borrowing this query for every
    /// `(start, end)` pair stored in `query_run_ranges`, in order.
    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
        self.query_run_ranges
            .iter()
            .map(|&(start, end)| QueryRun::new(self, start, end))
            .collect()
    }
536
    /// Tokenizes `text` against `index` and assembles the full [`Query`].
    ///
    /// - `line_threshold` is forwarded to `compute_query_runs`.
    /// - `binary_derived`: `Some(flag)` is used as `is_binary` directly;
    ///   `None` falls back to `detect_binary`.
    /// - `deadline` is checked via `ensure_within_deadline` on entry, every
    ///   128 lines, and once after tokenization.
    ///
    /// # Errors
    ///
    /// Propagates errors from `ensure_within_deadline` (presumably raised once
    /// the deadline has passed) and from `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one.
        let mut known_pos: Option<usize> = None;
        // Becomes true once any known token has been seen.
        let mut started = false;
        // Per line: `Some` for each known token, `None` for each unknown one
        // (stopwords are not recorded). Input for `compute_query_runs`.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        // `current_line` is 1-based, `line_index` is 0-based.
        for (current_line, (line_index, line)) in (1usize..).zip(text.lines().enumerate()) {
            // Re-check the deadline periodically rather than on every line.
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        // First known token gets position 0, later ones increment.
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknowns and stopwords before any known token are
                    // counted under the `None` key, later ones under the
                    // position of the preceding known token.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // Record the line only when it has an SPDX-like prefix and at
            // least one known token.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset;

                // `line_first_known_pos` is `Some`, so `known_pos` (and thus
                // `line_last_known_pos`) is `Some` too and the unwraps hold.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }
            tokens_by_line.push(line_tokens);
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        // Positions of legalese tokens.
        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        // Positions of regular tokens.
        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
661
662 fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
676 let null_byte_count = text.bytes().filter(|&b| b == 0).count();
677
678 if null_byte_count > 0 {
679 return Ok(true);
680 }
681
682 let non_printable_ratio = text
683 .chars()
684 .filter(|&c| {
685 !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
686 })
687 .count() as f64
688 / text.len().max(1) as f64;
689
690 Ok(non_printable_ratio > 0.3)
691 }
692
693 fn detect_long_lines(text: &str) -> bool {
703 text.lines()
704 .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
705 }
706
707 fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
708 lines
709 .iter()
710 .flat_map(|line| {
711 if line.is_empty() {
712 return Vec::new();
713 }
714
715 if line.len() <= Self::MAX_TOKEN_PER_LINE {
716 vec![line.clone()]
717 } else {
718 line.chunks(Self::MAX_TOKEN_PER_LINE)
719 .map(|chunk| chunk.to_vec())
720 .collect()
721 }
722 })
723 .collect()
724 }
725
    /// Groups known-token positions into query runs.
    ///
    /// `tokens_by_line` holds, per line, `Some` for each known token and
    /// `None` for each unknown one. When `has_long_lines` is set the lines are
    /// first re-chunked with `break_long_lines`. A run is closed once
    /// `line_threshold` or more "empty" lines have accumulated since the last
    /// line containing a legalese token; lines count as empty when they have
    /// no tokens, no known tokens, only digit tokens, or no legalese token.
    /// Runs made solely of digit-only tokens are discarded.
    ///
    /// Returns `(start, Some(end))` pairs with inclusive position bounds.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run has no tokens yet.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Next known-token position to hand out.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Close a non-empty run once enough empty lines have piled up.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // An empty run keeps sliding its start to the current position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) do not prevent a line from counting as
            // all-digit.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // `flatten` skips the `None` (unknown) entries.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with a legalese token resets the empty-line count.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the trailing run, if it has tokens and is not digits-only.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
802
    /// Returns the line number recorded for token position `pos`, or `None`
    /// when `pos` is out of range.
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
816
    /// Returns `true` when the query contains no known tokens.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
822
    /// Returns a [`QueryRun`] spanning every token of this query.
    ///
    /// The run is backed by an owned snapshot (clones of the tokens, line map
    /// and matchable sets taken at call time) and does not borrow `self`.
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
829
    /// Removes the positions covered by `span` from both `high_matchables`
    /// and `low_matchables`.
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
842
    /// Returns lines `start_line..=end_line` (1-based, inclusive) of the query
    /// text joined with `\n`; see [`matched_text_from_text`] for edge cases.
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
859}
860
/// Owned copy of the parts of a [`Query`] that a whole-query [`QueryRun`]
/// reads, so the run does not need to borrow the query.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The index the query was built against.
    index: &'a LicenseIndex,
    /// Clone of the query's token IDs.
    tokens: Vec<TokenId>,
    /// Clone of the query's position-to-line map.
    line_by_pos: Vec<usize>,
    /// Clone of the query's high matchables at snapshot time.
    high_matchables: PositionSet,
    /// Clone of the query's low matchables at snapshot time.
    low_matchables: PositionSet,
}
869
/// A contiguous slice of a query's token positions.
///
/// A run reads its data either from a borrowed [`Query`] (`query` is `Some`)
/// or from an owned snapshot (`whole_query_snapshot` is `Some`); the
/// constructors in this file always set exactly one of the two.
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Borrowed backing query, set by `QueryRun::new`.
    query: Option<&'a Query<'a>>,
    /// Owned backing data, set by `QueryRun::whole_query_snapshot`.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// First token position of the run.
    pub start: usize,
    /// Last token position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    /// Lazily computed result of `high_matchables`.
    cached_high_matchables: OnceCell<PositionSet>,
    /// Lazily computed result of `low_matchables`.
    cached_low_matchables: OnceCell<PositionSet>,
    /// Lazily computed union returned by `matchables(true)`.
    combined_matchables: RefCell<Option<PositionSet>>,
}
887
888impl<'a> QueryRun<'a> {
    /// Creates a run over positions `start..=end` of `query`, borrowing the
    /// query for its data. `end == None` denotes an empty run. All caches
    /// start out empty.
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
908
909 fn whole_query_snapshot(query: &Query<'a>) -> Self {
910 let end = if query.is_empty() {
911 None
912 } else {
913 Some(query.tokens.len() - 1)
914 };
915
916 Self {
917 query: None,
918 whole_query_snapshot: Some(WholeQueryRunSnapshot {
919 index: query.index,
920 tokens: query.tokens.clone(),
921 line_by_pos: query.line_by_pos.clone(),
922 high_matchables: query.high_matchables.clone(),
923 low_matchables: query.low_matchables.clone(),
924 }),
925 start: 0,
926 end,
927 cached_high_matchables: OnceCell::new(),
928 cached_low_matchables: OnceCell::new(),
929 combined_matchables: RefCell::new(None),
930 }
931 }
932
933 fn source_tokens(&self) -> &[TokenId] {
934 if let Some(query) = self.query {
935 &query.tokens
936 } else {
937 &self
938 .whole_query_snapshot
939 .as_ref()
940 .expect("snapshot-backed whole query run should have snapshot data")
941 .tokens
942 }
943 }
944
945 fn source_line_by_pos(&self) -> &[usize] {
946 if let Some(query) = self.query {
947 &query.line_by_pos
948 } else {
949 &self
950 .whole_query_snapshot
951 .as_ref()
952 .expect("snapshot-backed whole query run should have snapshot data")
953 .line_by_pos
954 }
955 }
956
957 fn source_high_matchables(&self) -> &PositionSet {
958 if let Some(query) = self.query {
959 &query.high_matchables
960 } else {
961 &self
962 .whole_query_snapshot
963 .as_ref()
964 .expect("snapshot-backed whole query run should have snapshot data")
965 .high_matchables
966 }
967 }
968
969 fn source_low_matchables(&self) -> &PositionSet {
970 if let Some(query) = self.query {
971 &query.low_matchables
972 } else {
973 &self
974 .whole_query_snapshot
975 .as_ref()
976 .expect("snapshot-backed whole query run should have snapshot data")
977 .low_matchables
978 }
979 }
980
981 pub fn get_index(&self) -> &LicenseIndex {
983 if let Some(query) = self.query {
984 query.index
985 } else {
986 self.whole_query_snapshot
987 .as_ref()
988 .expect("snapshot-backed whole query run should have snapshot data")
989 .index
990 }
991 }
992
    /// Returns the line number recorded for token position `pos` in the
    /// backing data, or `None` when `pos` is out of range. `pos` is an
    /// absolute position, not an offset relative to `start`.
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.source_line_by_pos().get(pos).copied()
    }
1003
1004 pub fn tokens(&self) -> &[TokenId] {
1010 match self.end {
1011 Some(end) => &self.source_tokens()[self.start..=end],
1012 None => &[],
1013 }
1014 }
1015
1016 pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
1020 self.tokens()
1021 .iter()
1022 .copied()
1023 .enumerate()
1024 .map(|(i, tid)| (self.start + i, tid))
1025 }
1026
1027 pub fn is_digits_only(&self) -> bool {
1031 self.tokens()
1032 .iter()
1033 .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
1034 }
1035
1036 pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
1046 if self.is_digits_only() {
1047 return false;
1048 }
1049
1050 let matchables = self.matchables(include_low);
1051
1052 if exclude_positions.is_empty() {
1053 return !matchables.is_empty();
1054 }
1055
1056 let mut matchable_set = matchables;
1057 for span in exclude_positions {
1058 matchable_set.remove_span(span);
1059 }
1060
1061 !matchable_set.is_empty()
1062 }
1063
1064 pub fn matchables(&self, include_low: bool) -> PositionSet {
1065 if include_low {
1066 if let Some(ref cached) = *self.combined_matchables.borrow() {
1067 return cached.clone();
1068 }
1069 let combined = self.low_matchables().union(&self.high_matchables());
1070 *self.combined_matchables.borrow_mut() = Some(combined.clone());
1071 combined
1072 } else {
1073 self.high_matchables()
1074 }
1075 }
1076
1077 pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1078 let high_matchables = self.high_matchables();
1079 if high_matchables.is_empty() {
1080 return Vec::new();
1081 }
1082
1083 let matchables = self.matchables(true);
1084 self.tokens_with_pos()
1085 .map(|(pos, tid)| {
1086 if matchables.contains(pos) {
1087 Some(tid)
1088 } else {
1089 None
1090 }
1091 })
1092 .collect()
1093 }
1094
1095 pub fn high_matchables(&self) -> PositionSet {
1096 self.cached_high_matchables
1097 .get_or_init(|| {
1098 let start = self.start;
1099 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1100 let source = self.source_high_matchables();
1101 let live_span = PositionSpan::new(start, end);
1102 source
1103 .iter()
1104 .filter(|&pos| live_span.contains(pos))
1105 .collect()
1106 })
1107 .clone()
1108 }
1109
1110 pub fn low_matchables(&self) -> PositionSet {
1111 self.cached_low_matchables
1112 .get_or_init(|| {
1113 let start = self.start;
1114 let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1115 let source = self.source_low_matchables();
1116 let live_span = PositionSpan::new(start, end);
1117 source
1118 .iter()
1119 .filter(|&pos| live_span.contains(pos))
1120 .collect()
1121 })
1122 .clone()
1123 }
1124}
1125
1126#[cfg(test)]
1127mod test;