provenant/license_detection/query/
mod.rs

1//! Query processing - tokenized input for license matching.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::tokenize::STOPWORDS;
6use crate::license_detection::tokenize::tokenize_as_ids;
7use bit_set::BitSet;
8use once_cell::sync::Lazy;
9use regex::Regex;
10use std::cell::{OnceCell, RefCell};
11use std::collections::{HashMap, HashSet};
12
/// Regex for a single query word: one or more word characters other than `_`,
/// optionally followed by one `+` and further word characters (e.g. `gpl2+`).
/// Compiled once on first use.
static QUERY_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Regex that splits text into two kinds of named captures:
/// - `token`: a word with the same shape as `QUERY_PATTERN`
/// - `punct`: a run of underscores, non-word characters, whitespace or `+`
///
/// Used by `tokenize_matched_text` so that punctuation and spacing between
/// words can be reproduced when rendering diagnostics. Compiled once on first use.
static MATCHED_TEXT_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
19
/// One piece of the original text as seen by the matched-text diagnostics:
/// either a word or a run of punctuation/whitespace.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece. Words that re-tokenize into several query
    /// tokens are stored as their lowercased sub-tokens; all other pieces keep
    /// the original text.
    value: String,
    /// 1-based number of the line this piece was found on.
    line_num: usize,
    /// Query token position for words found in the index dictionary;
    /// `None` for punctuation, stopwords and unknown words.
    pos: Option<usize>,
    /// `true` for word pieces, `false` for punctuation/whitespace pieces.
    is_text: bool,
    /// Set by `collect_reportable_tokens` when `pos` is one of the matched
    /// positions; always `false` straight after tokenization.
    is_matched: bool,
}
28
/// A single contiguous range of token positions, inclusive on both ends
/// (`start..=end`).
///
/// Used to track matched token positions and to do position arithmetic.
/// A span whose `start` is greater than its `end` contains no positions.
///
/// Distinct from `spans::Span` which tracks multiple byte ranges for coverage.
///
/// Based on Python Span class at:
/// reference/scancode-toolkit/src/licensedcode/spans.py
#[derive(Debug, Clone)]
pub struct PositionSpan {
    start: usize,
    end: usize,
}

impl PositionSpan {
    /// Create a span covering `start..=end`.
    pub fn new(start: usize, end: usize) -> Self {
        PositionSpan { start, end }
    }

    /// Return `true` when `pos` lies within the span (both bounds inclusive).
    pub fn contains(&self, pos: usize) -> bool {
        (self.start..=self.end).contains(&pos)
    }

    /// Iterate over every position in the span in ascending order.
    pub fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        let (first, last) = (self.start, self.end);
        first..=last
    }
}
57
/// Tokenized input text prepared for license matching.
///
/// Query holds:
/// - Known token IDs (tokens existing in the index dictionary)
/// - Token positions and their corresponding line numbers (line_by_pos)
/// - Unknown tokens (tokens not in dictionary) tracked per position
/// - Stopwords tracked per position
/// - Positions with short/digit-only tokens
/// - High and low matchable token positions (for tracking what's been matched)
///
/// Based on Python Query class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
#[derive(Debug)]
pub struct Query<'a> {
    /// The original input text.
    ///
    /// Corresponds to Python: `self.query_string` (line 215)
    pub text: String,

    /// Token IDs for known tokens (tokens found in the index dictionary),
    /// in text order. The index into this vector is the "token position".
    ///
    /// Corresponds to Python: `self.tokens = []` (line 228)
    pub tokens: Vec<TokenId>,

    /// Mapping from token position to line number (1-based)
    ///
    /// Each token position in `self.tokens` maps to the line number where it appears.
    /// This is used for match position reporting.
    ///
    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
    pub line_by_pos: Vec<usize>,

    /// Mapping from token position to count of unknown tokens after that position
    ///
    /// Unknown tokens are those not found in the dictionary. We track them by
    /// counting how many unknown tokens appear after each known position.
    /// Unknown tokens before the first known token (Python's position -1) are
    /// counted under the key `None`.
    ///
    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
    pub unknowns_by_pos: HashMap<Option<i32>, usize>,

    /// Mapping from token position to count of stopwords after that position
    ///
    /// Same keying scheme as `unknowns_by_pos`, but counting stopwords.
    ///
    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
    pub stopwords_by_pos: HashMap<Option<i32>, usize>,

    /// Set of positions whose known token is flagged `is_short_or_digit`
    /// (single-character or digit-only tokens)
    ///
    /// These tokens have special handling in matching.
    ///
    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
    pub shorts_and_digits_pos: HashSet<usize>,

    /// High-value matchable token positions (legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Legalese`
    /// (in the Python reference: tokens with ID < len_legalese).
    ///
    /// Corresponds to Python: `self.high_matchables` (line 293)
    pub high_matchables: BitSet,

    /// Low-value matchable token positions (non-legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Regular`
    /// (in the Python reference: tokens with ID >= len_legalese).
    ///
    /// Corresponds to Python: `self.low_matchables` (line 294)
    pub low_matchables: BitSet,

    /// True if the query is detected as binary content
    ///
    /// Corresponds to Python: `self.is_binary = False` (line 225)
    pub is_binary: bool,

    /// Raw query run ranges (start, end) computed during tokenization.
    ///
    /// Both bounds are token positions; `end` is inclusive.
    /// QueryRuns are created on-demand from these ranges.
    ///
    /// Corresponds to Python: `self.query_runs = []` (line 274)
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// SPDX-License-Identifier lines found during tokenization.
    ///
    /// Each tuple is (spdx_text, start_token_pos, end_token_pos), where
    /// spdx_text is the trimmed line. As built by `with_source_options`, the
    /// end is one past the position of the last known token seen so far.
    /// Used for creating LicenseMatches with correct token positions.
    ///
    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Reference to the license index for dictionary access and metadata
    pub index: &'a LicenseIndex,
}
150
/// Extract the lines `start_line..=end_line` of `text`, joined with `\n`.
///
/// Line numbers are 1-based and both bounds are inclusive. Lines are split
/// with `str::lines`, so `\r\n` terminators are dropped and the result always
/// uses `\n` between lines. Lines past the end of `text` are ignored.
///
/// # Returns
/// The selected lines, or an empty string when either bound is 0, when
/// `start_line > end_line`, or when the range lies past the end of `text`.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // Cannot overflow: start_line >= 1, so the count is at most `end_line`.
    let wanted = end_line - start_line + 1;
    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();
    selected.join("\n")
}
169
170pub fn matched_text_diagnostics_from_text(
171    text: &str,
172    query: &Query<'_>,
173    matched_positions: &HashSet<usize>,
174    start_pos: usize,
175    end_pos: usize,
176    start_line: usize,
177    end_line: usize,
178) -> String {
179    let tokens = tokenize_matched_text(text, query);
180    let reportable_tokens = collect_reportable_tokens(
181        tokens,
182        matched_positions,
183        start_pos,
184        end_pos,
185        start_line,
186        end_line,
187    );
188    let line_endings = collect_line_endings(text);
189
190    render_diagnostic_tokens(&reportable_tokens, &line_endings)
191}
192
193fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
194    let mut tokens = Vec::new();
195    let mut pos = 0usize;
196    let mut line_num = 1usize;
197
198    for line in text.split_inclusive('\n') {
199        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
200            if let Some(token_match) = capture.name("token") {
201                let token_text = token_match.as_str();
202                let retokenized: Vec<String> = QUERY_PATTERN
203                    .find_iter(&token_text.to_lowercase())
204                    .map(|m| m.as_str().to_string())
205                    .filter(|token| !STOPWORDS.contains(token.as_str()))
206                    .collect();
207
208                if retokenized.is_empty() {
209                    tokens.push(MatchedTextToken {
210                        value: token_text.to_string(),
211                        line_num,
212                        pos: None,
213                        is_text: true,
214                        is_matched: false,
215                    });
216                } else if retokenized.len() == 1 {
217                    let token = &retokenized[0];
218                    let token_pos = if query.index.dictionary.get(token).is_some() {
219                        let current_pos = pos;
220                        pos += 1;
221                        Some(current_pos)
222                    } else {
223                        None
224                    };
225
226                    tokens.push(MatchedTextToken {
227                        value: token_text.to_string(),
228                        line_num,
229                        pos: token_pos,
230                        is_text: true,
231                        is_matched: false,
232                    });
233                } else {
234                    for token in retokenized {
235                        let token_pos = if query.index.dictionary.get(&token).is_some() {
236                            let current_pos = pos;
237                            pos += 1;
238                            Some(current_pos)
239                        } else {
240                            None
241                        };
242
243                        tokens.push(MatchedTextToken {
244                            value: token,
245                            line_num,
246                            pos: token_pos,
247                            is_text: true,
248                            is_matched: false,
249                        });
250                    }
251                }
252            } else if let Some(punct_match) = capture.name("punct") {
253                tokens.push(MatchedTextToken {
254                    value: punct_match.as_str().to_string(),
255                    line_num,
256                    pos: None,
257                    is_text: false,
258                    is_matched: false,
259                });
260            }
261        }
262
263        line_num += 1;
264    }
265
266    tokens
267}
268
/// Select the pieces of `tokens` that belong in the diagnostic rendering of a
/// match, marking the matched ones.
///
/// # Arguments
/// * `tokens` - All pieces of the text, in text order
/// * `matched_positions` - Query token positions that were matched
/// * `start_pos` / `end_pos` - First and last token position of the match
/// * `start_line` / `end_line` - Inclusive line range; pieces before
///   `start_line` are skipped and scanning stops after `end_line`
///
/// # Returns
/// The pieces to render: every piece from the one at `start_pos` through the
/// one at `end_pos`, any other piece whose position is in
/// `matched_positions`, and a non-blank punctuation piece directly following
/// the piece at `end_pos`.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &HashSet<usize>,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started` is true between the start and end pieces; `finished` stays
    // true once the end piece has been seen.
    let mut started = false;
    let mut finished = false;
    // Index (into `tokens`) of the end piece, kept only until the piece
    // right after it has been examined.
    let mut end_real_pos = None;
    // Index of the previously examined piece.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        // Matched pieces are always reported, wherever they sit in the range.
        if token
            .pos
            .is_some_and(|pos| matched_positions.contains(&pos))
        {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything between start and end is reported, matched or not.
        if started && !finished {
            is_included = true;
        }

        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // Only true for the piece immediately after the end piece: keep it
        // when it is punctuation with visible (non-whitespace) content.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
333
/// Record the terminator of every line in `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line that
/// has no terminator. Empty input yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_owned());
    }

    endings
}
347
348fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
349    let mut rendered = String::new();
350    let mut previous_line: Option<usize> = None;
351
352    for token in tokens {
353        if let Some(prev_line) = previous_line
354            && token.line_num > prev_line
355        {
356            for line in prev_line..token.line_num {
357                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
358                    rendered.push_str(line_ending.as_str());
359                }
360            }
361        }
362
363        let token_value = if token.is_text {
364            token.value.as_str()
365        } else {
366            token
367                .value
368                .strip_suffix("\r\n")
369                .or_else(|| token.value.strip_suffix('\n'))
370                .unwrap_or(token.value.as_str())
371        };
372
373        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
374            if token.is_matched {
375                rendered.push_str(token_value);
376            } else {
377                rendered.push('[');
378                rendered.push_str(token_value);
379                rendered.push(']');
380            }
381        } else {
382            rendered.push_str(token_value);
383        }
384
385        previous_line = Some(token.line_num);
386    }
387
388    rendered
389}
390
391impl<'a> Query<'a> {
    /// Query-run line threshold used for input that is not binary-derived.
    ///
    /// Detection scans file-like text, so this uses Python's
    /// `build_query(..., text_line_threshold=15)` threshold.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Query-run line threshold used when the input is binary-derived
    /// (see `from_extracted_text`).
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when `break_long_lines` splits a line.
    const MAX_TOKEN_PER_LINE: usize = 25;
409
410    fn compute_spdx_offset(
411        tokens: &[QueryToken],
412        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
413    ) -> Option<usize> {
414        let get_known_id = |i: usize| -> Option<TokenId> {
415            match tokens.get(i)? {
416                QueryToken::Known(known) => Some(known.id),
417                _ => None,
418            }
419        };
420
421        let spdx_id = dictionary.get("spdx")?;
422        let license_id = dictionary.get("license")?;
423        let identifier_id = dictionary.get("identifier")?;
424        let licence_id = dictionary.get("licence");
425
426        let licenses_id = dictionary.get("licenses");
427        let nuget_id = dictionary.get("nuget");
428        let org_id = dictionary.get("org");
429
430        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
431            ids.iter().all(|id| id.is_some())
432                && ids[0] == Some(spdx_id)
433                && (ids[1] == Some(license_id) || ids[1] == licence_id)
434                && ids[2] == Some(identifier_id)
435        };
436
437        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
438            licenses_id.is_some()
439                && nuget_id.is_some()
440                && org_id.is_some()
441                && ids[0] == licenses_id
442                && ids[1] == Some(nuget_id.unwrap())
443                && ids[2] == Some(org_id.unwrap())
444        };
445
446        if tokens.len() >= 3 {
447            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
448            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
449                return Some(0);
450            }
451        }
452
453        if tokens.len() >= 4 {
454            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
455            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
456                return Some(1);
457            }
458        }
459
460        if tokens.len() >= 5 {
461            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
462            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
463                return Some(2);
464            }
465        }
466
467        None
468    }
469
470    pub fn from_extracted_text(
471        text: &str,
472        index: &'a LicenseIndex,
473        binary_derived: bool,
474    ) -> Result<Self, anyhow::Error> {
475        let line_threshold = if binary_derived {
476            Self::BINARY_LINE_THRESHOLD
477        } else {
478            Self::TEXT_LINE_THRESHOLD
479        };
480
481        Self::with_source_options(text, index, line_threshold, Some(binary_derived))
482    }
483
484    /// Iterate over query runs.
485    ///
486    /// Corresponds to Python: `query.query_runs` property iteration
487    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
488        self.query_run_ranges
489            .iter()
490            .map(|&(start, end)| QueryRun::new(self, start, end))
491            .collect()
492    }
493
    /// Tokenize `text` against `index` and assemble a `Query`.
    ///
    /// # Arguments
    /// * `text` - The input text; processed line by line via `str::lines`
    /// * `index` - The license index whose dictionary classifies each token
    /// * `line_threshold` - Forwarded to `compute_query_runs` as the number of
    ///   low-value lines that closes a query run
    /// * `binary_derived` - `Some(flag)` is stored as `is_binary` unchanged;
    ///   `None` falls back to `detect_binary(text)`
    ///
    /// # Errors
    /// Propagates any error returned by `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
    ) -> Result<Self, anyhow::Error> {
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = HashSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; -1 until the first one is seen.
        let mut known_pos = -1i32;
        // Becomes true once any known token has been seen.
        let mut started = false;
        let mut current_line = 1usize;

        // Per line: `Some` for each known token, `None` for each unknown token
        // (stopwords are not recorded). Input to query-run computation.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for line in text.lines() {
            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos += 1;
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = Some(known_pos);
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos as usize);
                        }
                    }
                    // Unknowns/stopwords before the first known token are
                    // counted under `None`; later ones under the position of
                    // the known token that precedes them.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // Record the line as an SPDX line when a prefix was found and the
            // line has at least one known token.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let spdx_start_known_pos = line_first_known_pos + offset as i32;
                if spdx_start_known_pos <= line_last_known_pos {
                    let spdx_start = spdx_start_known_pos as usize;
                    // Stored end is one past the last known position.
                    let spdx_end = (line_last_known_pos + 1) as usize;
                    spdx_lines.push((line_trimmed.to_string(), spdx_start, spdx_end));
                }
            }

            tokens_by_line.push(line_tokens);
            current_line += 1;
        }

        // Positions of legalese tokens.
        let high_matchables: BitSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        // Positions of regular (non-legalese) tokens.
        let low_matchables: BitSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
612
613    /// Detect if text is binary content.
614    ///
615    /// Binary detection checks for:
616    /// - Null bytes (0x00)
617    /// - High ratio of non-printable characters
618    ///
619    /// # Arguments
620    /// * `text` - The text to analyze
621    ///
622    /// # Returns
623    /// true if binary, false otherwise
624    ///
625    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
626    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
627        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
628
629        if null_byte_count > 0 {
630            return Ok(true);
631        }
632
633        let non_printable_ratio = text
634            .chars()
635            .filter(|&c| {
636                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
637            })
638            .count() as f64
639            / text.len().max(1) as f64;
640
641        Ok(non_printable_ratio > 0.3)
642    }
643
644    /// Detect if text has very long lines (for minified JS/CSS).
645    ///
646    /// # Arguments
647    /// * `text` - The text to analyze
648    ///
649    /// # Returns
650    /// true if there are lines with many tokens, false otherwise
651    ///
652    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
653    fn detect_long_lines(text: &str) -> bool {
654        text.lines()
655            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
656    }
657
658    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
659        lines
660            .iter()
661            .flat_map(|line| {
662                if line.is_empty() {
663                    return Vec::new();
664                }
665
666                if line.len() <= Self::MAX_TOKEN_PER_LINE {
667                    vec![line.clone()]
668                } else {
669                    line.chunks(Self::MAX_TOKEN_PER_LINE)
670                        .map(|chunk| chunk.to_vec())
671                        .collect()
672                }
673            })
674            .collect()
675    }
676
    /// Split the token stream into query runs separated by stretches of
    /// low-value lines.
    ///
    /// # Arguments
    /// * `tokens_by_line` - Per line, `Some` for each known token and `None`
    ///   for each unknown token
    /// * `line_threshold` - Number of consecutive low-value lines after which
    ///   the current run is closed
    /// * `has_long_lines` - When true, lines are first split with
    ///   `break_long_lines`
    ///
    /// # Returns
    /// `(start, end)` token position ranges, `end` inclusive. Runs whose known
    /// tokens are all digit-only are not returned.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // Position of the last known token in the current run; `None` while
        // the run has no known token yet.
        let mut query_run_end = None;
        // Consecutive low-value lines seen since the last line with legalese.
        let mut empty_lines = 0usize;
        // Position that the next known token will receive.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Enough low-value lines in a row: close the current run and
            // start a fresh one at the current position.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // A run without tokens yet starts at the current position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) count as digit-only for this check.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Only known tokens advance `pos` and extend the run.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with legalese resets the low-value line counter.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the run that was still open at the end of the input.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
753
754    /// Get the length of the query in tokens.
755    ///
756    /// Get the line number for a token position.
757    ///
758    /// # Arguments
759    /// * `pos` - The token position
760    ///
761    /// # Returns
762    /// The line number (1-based)
763    #[inline]
764    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
765        self.line_by_pos.get(pos).copied()
766    }
767
    /// Check if the query is empty, i.e. `tokens` holds no known token.
    ///
    /// Unknown tokens and stopwords are not stored in `tokens`, so a text made
    /// only of those yields an empty query.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
773
    /// Get a query run covering the entire query.
    ///
    /// The run is built by `QueryRun::whole_query_snapshot` from the query's
    /// current state.
    ///
    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
780
781    /// Subtract matched span positions from matchables.
782    ///
783    /// This removes the positions from both high and low matchables.
784    ///
785    /// # Arguments
786    /// * `span` - The span of positions to subtract
787    ///
788    /// Corresponds to Python: `subtract()` method (lines 328-334)
789    pub fn subtract(&mut self, span: &PositionSpan) {
790        for pos in span.iter() {
791            self.high_matchables.remove(pos);
792            self.low_matchables.remove(pos);
793        }
794    }
795
    /// Extract matched text for a given line range.
    ///
    /// Returns the text from the original input between start_line and end_line
    /// (both inclusive, 1-indexed). Delegates to `matched_text_from_text` on
    /// the query's stored `text`.
    ///
    /// # Arguments
    /// * `start_line` - Starting line number (1-indexed)
    /// * `end_line` - Ending line number (1-indexed)
    ///
    /// # Returns
    /// The matched text, or empty string if lines are out of range
    ///
    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
812}
813
/// Owned copy of the parts of a `Query` that a whole-query `QueryRun` reads.
///
/// Filled in by `QueryRun::whole_query_snapshot`, which clones these fields
/// from the query instead of borrowing it.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The license index of the source query.
    index: &'a LicenseIndex,
    /// Copy of the query's known token IDs.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line mapping.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchable positions at snapshot time.
    high_matchables: BitSet,
    /// Copy of the query's low matchable positions at snapshot time.
    low_matchables: BitSet,
}
822
/// A query run is a slice of query tokens identified by a start and end positions.
///
/// Query runs break a query into manageable chunks for efficient matching.
/// They track matchable token positions and support subtraction of matched spans.
///
/// A run reads its data either from a borrowed parent `Query` (`query` is
/// `Some`) or from an owned snapshot (`whole_query_snapshot` is `Some`).
///
/// Based on Python QueryRun class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Parent query for runs created with `QueryRun::new`; `None` for
    /// snapshot-backed runs.
    query: Option<&'a Query<'a>>,
    /// Owned query data for runs created with `whole_query_snapshot`;
    /// `None` for runs that borrow a parent query.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// First token position of the run (inclusive).
    pub start: usize,
    /// Last token position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    /// Lazily initialized cache; starts empty.
    /// NOTE(review): presumably holds the run's high matchables - filled in
    /// by code outside this view, confirm there.
    cached_high_matchables: OnceCell<BitSet>,
    /// Lazily initialized cache; starts empty.
    /// NOTE(review): presumably holds the run's low matchables - confirm.
    cached_low_matchables: OnceCell<BitSet>,
    /// Interior-mutable slot that starts as `None`.
    /// NOTE(review): presumably a cached union of high and low matchables - confirm.
    combined_matchables: RefCell<Option<BitSet>>,
}
840
841impl<'a> QueryRun<'a> {
    /// Create a new query run from a query with start and end positions.
    ///
    /// The run borrows `query`; no snapshot is taken and all caches start empty.
    ///
    /// # Arguments
    /// * `query` - The parent query
    /// * `start` - The start position (inclusive)
    /// * `end` - The end position (inclusive), or None for an empty run
    ///
    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
861
862    fn whole_query_snapshot(query: &Query<'a>) -> Self {
863        let end = if query.is_empty() {
864            None
865        } else {
866            Some(query.tokens.len() - 1)
867        };
868
869        Self {
870            query: None,
871            whole_query_snapshot: Some(WholeQueryRunSnapshot {
872                index: query.index,
873                tokens: query.tokens.clone(),
874                line_by_pos: query.line_by_pos.clone(),
875                high_matchables: query.high_matchables.clone(),
876                low_matchables: query.low_matchables.clone(),
877            }),
878            start: 0,
879            end,
880            cached_high_matchables: OnceCell::new(),
881            cached_low_matchables: OnceCell::new(),
882            combined_matchables: RefCell::new(None),
883        }
884    }
885
886    fn source_tokens(&self) -> &[TokenId] {
887        if let Some(query) = self.query {
888            &query.tokens
889        } else {
890            &self
891                .whole_query_snapshot
892                .as_ref()
893                .expect("snapshot-backed whole query run should have snapshot data")
894                .tokens
895        }
896    }
897
898    fn source_line_by_pos(&self) -> &[usize] {
899        if let Some(query) = self.query {
900            &query.line_by_pos
901        } else {
902            &self
903                .whole_query_snapshot
904                .as_ref()
905                .expect("snapshot-backed whole query run should have snapshot data")
906                .line_by_pos
907        }
908    }
909
910    fn source_high_matchables(&self) -> &BitSet {
911        if let Some(query) = self.query {
912            &query.high_matchables
913        } else {
914            &self
915                .whole_query_snapshot
916                .as_ref()
917                .expect("snapshot-backed whole query run should have snapshot data")
918                .high_matchables
919        }
920    }
921
922    fn source_low_matchables(&self) -> &BitSet {
923        if let Some(query) = self.query {
924            &query.low_matchables
925        } else {
926            &self
927                .whole_query_snapshot
928                .as_ref()
929                .expect("snapshot-backed whole query run should have snapshot data")
930                .low_matchables
931        }
932    }
933
934    /// Get the license index used by this query run.
935    pub fn get_index(&self) -> &LicenseIndex {
936        if let Some(query) = self.query {
937            query.index
938        } else {
939            self.whole_query_snapshot
940                .as_ref()
941                .expect("snapshot-backed whole query run should have snapshot data")
942                .index
943        }
944    }
945
946    /// Get the line number for a specific token position.
947    ///
948    /// # Arguments
949    /// * `pos` - Absolute token position in the query
950    ///
951    /// # Returns
952    /// The line number (1-based), or None if position is out of range
953    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
954        self.source_line_by_pos().get(pos).copied()
955    }
956
957    /// Get the sequence of token IDs for this run.
958    ///
959    /// Returns empty slice if end is None.
960    ///
961    /// Corresponds to Python: `tokens` property (lines 779-786)
962    pub fn tokens(&self) -> &[TokenId] {
963        match self.end {
964            Some(end) => &self.source_tokens()[self.start..=end],
965            None => &[],
966        }
967    }
968
969    /// Iterate over token IDs with their absolute positions.
970    ///
971    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
972    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
973        self.tokens()
974            .iter()
975            .copied()
976            .enumerate()
977            .map(|(i, tid)| (self.start + i, tid))
978    }
979
980    /// Check if this query run contains only digit tokens.
981    ///
982    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
983    pub fn is_digits_only(&self) -> bool {
984        self.tokens()
985            .iter()
986            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
987    }
988
989    /// Check if this query run has matchable tokens.
990    ///
991    /// # Arguments
992    /// * `include_low` - If true, include low-value tokens in the check
993    /// * `exclude_positions` - Optional set of spans containing positions to exclude
994    ///
995    /// Returns true if there are matchable tokens remaining
996    ///
997    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
998    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
999        if self.is_digits_only() {
1000            return false;
1001        }
1002
1003        let matchables = self.matchables(include_low);
1004
1005        if exclude_positions.is_empty() {
1006            return !matchables.is_empty();
1007        }
1008
1009        let mut matchable_set = matchables;
1010        for span in exclude_positions {
1011            for pos in span.iter() {
1012                matchable_set.remove(pos);
1013            }
1014        }
1015
1016        !matchable_set.is_empty()
1017    }
1018
1019    pub fn matchables(&self, include_low: bool) -> BitSet {
1020        if include_low {
1021            if let Some(ref cached) = *self.combined_matchables.borrow() {
1022                return cached.clone();
1023            }
1024            let combined: BitSet = self
1025                .low_matchables()
1026                .union(&self.high_matchables())
1027                .collect();
1028            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1029            combined
1030        } else {
1031            self.high_matchables()
1032        }
1033    }
1034
1035    pub fn matchable_tokens(&self) -> Vec<i32> {
1036        let high_matchables = self.high_matchables();
1037        if high_matchables.is_empty() {
1038            return Vec::new();
1039        }
1040
1041        let matchables = self.matchables(true);
1042        self.tokens_with_pos()
1043            .map(|(pos, tid)| {
1044                if matchables.contains(pos) {
1045                    tid.raw() as i32
1046                } else {
1047                    -1
1048                }
1049            })
1050            .collect()
1051    }
1052
1053    pub fn high_matchables(&self) -> BitSet {
1054        self.cached_high_matchables
1055            .get_or_init(|| {
1056                let start = self.start;
1057                let end = self.end;
1058                let source = self.source_high_matchables();
1059                let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1060                source
1061                    .iter()
1062                    .filter(|&pos| live_span.contains(pos))
1063                    .collect()
1064            })
1065            .clone()
1066    }
1067
1068    pub fn low_matchables(&self) -> BitSet {
1069        self.cached_low_matchables
1070            .get_or_init(|| {
1071                let start = self.start;
1072                let end = self.end;
1073                let source = self.source_low_matchables();
1074                let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1075                source
1076                    .iter()
1077                    .filter(|&pos| live_span.contains(pos))
1078                    .collect()
1079            })
1080            .clone()
1081    }
1082}
1083
1084#[cfg(test)]
1085mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs