provenant/license_detection/query/
mod.rs

1//! Query processing - tokenized input for license matching.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::models::PositionSpan;
6use crate::license_detection::position_set::PositionSet;
7use crate::license_detection::spdx_lid::split_spdx_lid;
8use crate::license_detection::tokenize::STOPWORDS;
9use crate::license_detection::tokenize::tokenize_as_ids;
10use regex::Regex;
11use std::cell::{OnceCell, RefCell};
12use std::collections::HashMap;
13use std::sync::LazyLock;
14use std::time::Instant;
15
/// Matches a single query word: one or more word characters (underscore
/// excluded), optionally followed by one `+` and further word characters.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into alternating named captures: `token` (same shape as
/// `QUERY_PATTERN`) and `punct` (runs of underscores, non-word characters,
/// whitespace and `+`).
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
22
/// One piece of the original text (a word or a punctuation run) as used when
/// rendering matched-text diagnostics.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the piece as it will be rendered.
    value: String,
    /// 1-based line number on which the piece was found.
    line_num: usize,
    /// Query token position, set only for words found in the index dictionary.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// `true` once the token's position is found in the matched positions.
    is_matched: bool,
}
31
/// Tokenized input text prepared for license matching.
///
/// Query holds:
/// - Known token IDs (tokens existing in the index dictionary)
/// - Token positions and their corresponding line numbers (line_by_pos)
/// - Unknown tokens (tokens not in dictionary) tracked per position
/// - Stopwords tracked per position
/// - Positions with short/digit-only tokens
/// - High and low matchable token positions (for tracking what's been matched)
///
/// Based on Python Query class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
#[derive(Debug)]
pub struct Query<'a> {
    /// The original input text.
    ///
    /// Corresponds to Python: `self.query_string` (line 215)
    pub text: String,

    /// Token IDs for known tokens (tokens found in the index dictionary)
    ///
    /// Corresponds to Python: `self.tokens = []` (line 228)
    pub tokens: Vec<TokenId>,

    /// Mapping from token position to line number (1-based)
    ///
    /// Each token position in `self.tokens` maps to the line number where it appears.
    /// This is used for match position reporting.
    ///
    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
    pub line_by_pos: Vec<usize>,

    /// Mapping from token position to count of unknown tokens after that position
    ///
    /// Unknown tokens are those not found in the dictionary. We track them by
    /// counting how many unknown tokens appear after each known position.
    /// Unknown tokens before the first known token are tracked with the key `None`.
    ///
    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Mapping from token position to count of stopwords after that position
    ///
    /// Same keying scheme as `unknowns_by_pos`, but counting stopwords:
    /// stopwords before the first known token use the key `None`.
    ///
    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Set of positions with single-character or digit-only tokens
    ///
    /// These tokens have special handling in matching.
    ///
    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
    pub shorts_and_digits_pos: PositionSet,

    /// High-value matchable token positions (legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Legalese`.
    ///
    /// Corresponds to Python: `self.high_matchables` (line 293)
    pub high_matchables: PositionSet,

    /// Low-value matchable token positions (non-legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Regular`.
    ///
    /// Corresponds to Python: `self.low_matchables` (line 294)
    pub low_matchables: PositionSet,

    /// True if the query is detected as binary content
    ///
    /// Corresponds to Python: `self.is_binary = False` (line 225)
    pub is_binary: bool,

    /// Raw query run ranges (start, end) computed during tokenization.
    ///
    /// `end` is the position of the last token of the run (inclusive).
    /// QueryRuns are created on-demand from these ranges.
    ///
    /// Corresponds to Python: `self.query_runs = []` (line 274)
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// SPDX-License-Identifier lines found during tokenization.
    ///
    /// Each tuple is (spdx_text, start_token_pos, end_token_pos), where
    /// `end_token_pos` is one past the last known token position of the line.
    /// Used for creating LicenseMatches with correct token positions.
    ///
    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Reference to the license index for dictionary access and metadata
    pub index: &'a LicenseIndex,
}
123
/// Return the lines `start_line..=end_line` (1-based, inclusive) of `text`,
/// joined with `\n`.
///
/// Line terminators of the input (`\n` or `\r\n`) are not preserved; selected
/// lines are always re-joined with a single `\n`. An empty string is returned
/// when either bound is zero, when `start_line > end_line`, or when the range
/// lies entirely past the end of `text`.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // `start_line >= 1` here, so neither subtraction below can underflow.
    let wanted = end_line - start_line + 1;
    let mut selected = String::new();

    for (nth, line) in text.lines().skip(start_line - 1).take(wanted).enumerate() {
        if nth > 0 {
            selected.push('\n');
        }
        selected.push_str(line);
    }

    selected
}
142
/// Render the matched region of `text` with unmatched words marked in
/// square brackets.
///
/// The text is re-tokenized against `query`'s dictionary, reduced to the
/// tokens worth reporting for the given position/line window, and rendered
/// back into a string using the original line endings.
///
/// # Arguments
/// * `text` - The text to render from
/// * `query` - Query whose index dictionary decides which words get positions
/// * `matched_positions` - Token positions that were matched
/// * `start_pos` / `end_pos` - First and last token position of the match
/// * `start_line` / `end_line` - 1-based line window; tokens outside are dropped
pub fn matched_text_diagnostics_from_text(
    text: &str,
    query: &Query<'_>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> String {
    let tokens = tokenize_matched_text(text, query);
    let reportable_tokens = collect_reportable_tokens(
        tokens,
        matched_positions,
        start_pos,
        end_pos,
        start_line,
        end_line,
    );
    let line_endings = collect_line_endings(text);

    render_diagnostic_tokens(&reportable_tokens, &line_endings)
}
165
166fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
167    let mut tokens = Vec::new();
168    let mut pos = 0usize;
169    let mut line_num = 1usize;
170
171    for line in text.split_inclusive('\n') {
172        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
173            if let Some(token_match) = capture.name("token") {
174                let token_text = token_match.as_str();
175                let retokenized: Vec<String> = QUERY_PATTERN
176                    .find_iter(&token_text.to_lowercase())
177                    .map(|m| m.as_str().to_string())
178                    .filter(|token| !STOPWORDS.contains(token.as_str()))
179                    .collect();
180
181                if retokenized.is_empty() {
182                    tokens.push(MatchedTextToken {
183                        value: token_text.to_string(),
184                        line_num,
185                        pos: None,
186                        is_text: true,
187                        is_matched: false,
188                    });
189                } else if retokenized.len() == 1 {
190                    let token = &retokenized[0];
191                    let token_pos = if query.index.dictionary.get(token).is_some() {
192                        let current_pos = pos;
193                        pos += 1;
194                        Some(current_pos)
195                    } else {
196                        None
197                    };
198
199                    tokens.push(MatchedTextToken {
200                        value: token_text.to_string(),
201                        line_num,
202                        pos: token_pos,
203                        is_text: true,
204                        is_matched: false,
205                    });
206                } else {
207                    for token in retokenized {
208                        let token_pos = if query.index.dictionary.get(&token).is_some() {
209                            let current_pos = pos;
210                            pos += 1;
211                            Some(current_pos)
212                        } else {
213                            None
214                        };
215
216                        tokens.push(MatchedTextToken {
217                            value: token,
218                            line_num,
219                            pos: token_pos,
220                            is_text: true,
221                            is_matched: false,
222                        });
223                    }
224                }
225            } else if let Some(punct_match) = capture.name("punct") {
226                tokens.push(MatchedTextToken {
227                    value: punct_match.as_str().to_string(),
228                    line_num,
229                    pos: None,
230                    is_text: false,
231                    is_matched: false,
232                });
233            }
234        }
235
236        line_num += 1;
237    }
238
239    tokens
240}
241
/// Select the tokens that belong in a matched-text diagnostic.
///
/// Tokens on lines before `start_line` are skipped and iteration stops at the
/// first token past `end_line`. Within that window a token is kept when:
/// - its position is in `matched_positions` (it is also flagged as matched), or
/// - it lies between the token at `start_pos` and the token at `end_pos`
///   (both included), or
/// - it is a non-blank punctuation run immediately following the token at
///   `end_pos`.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`/`finished` track whether we are inside the start_pos..=end_pos region.
    let mut started = false;
    let mut finished = false;
    // Index (in `tokens`) of the token carrying `end_pos`, until it is consumed below.
    let mut end_real_pos = None;
    // Index of the previously visited in-window token.
    let mut last_real_pos = None;

    // `real_pos` indexes the full token list, including tokens skipped by the line filter.
    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything inside the region is reported, matched or not.
        if started && !finished {
            is_included = true;
        }

        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token right after the end token: keep trailing
        // punctuation (but not pure whitespace) that closes the match.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
303
/// Record the terminator of every line in `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or an empty string for a final line that
/// has no terminator. Empty input yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
317
318fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
319    let mut rendered = String::new();
320    let mut previous_line: Option<usize> = None;
321
322    for token in tokens {
323        if let Some(prev_line) = previous_line
324            && token.line_num > prev_line
325        {
326            for line in prev_line..token.line_num {
327                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
328                    rendered.push_str(line_ending.as_str());
329                }
330            }
331        }
332
333        let token_value = if token.is_text {
334            token.value.as_str()
335        } else {
336            token
337                .value
338                .strip_suffix("\r\n")
339                .or_else(|| token.value.strip_suffix('\n'))
340                .unwrap_or(token.value.as_str())
341        };
342
343        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
344            if token.is_matched {
345                rendered.push_str(token_value);
346            } else {
347                rendered.push('[');
348                rendered.push_str(token_value);
349                rendered.push(']');
350            }
351        } else {
352            rendered.push_str(token_value);
353        }
354
355        previous_line = Some(token.line_num);
356    }
357
358    rendered
359}
360
361impl<'a> Query<'a> {
    /// Line threshold passed to query-run computation for regular
    /// (non binary-derived) text.
    ///
    /// Detection scans file-like text, so this uses Python's
    /// `build_query(..., text_line_threshold=15)` threshold.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold passed to query-run computation for binary-derived text.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens kept in one chunk when long lines are broken up.
    const MAX_TOKEN_PER_LINE: usize = 25;
379
380    fn compute_spdx_offset(
381        tokens: &[QueryToken],
382        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
383    ) -> Option<usize> {
384        let get_known_id = |i: usize| -> Option<TokenId> {
385            match tokens.get(i)? {
386                QueryToken::Known(known) => Some(known.id),
387                _ => None,
388            }
389        };
390
391        let spdx_id = dictionary.get("spdx")?;
392        let license_id = dictionary.get("license")?;
393        let identifier_id = dictionary.get("identifier")?;
394        let licence_id = dictionary.get("licence");
395
396        let licenses_id = dictionary.get("licenses");
397        let nuget_id = dictionary.get("nuget");
398        let org_id = dictionary.get("org");
399
400        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
401            ids.iter().all(|id| id.is_some())
402                && ids[0] == Some(spdx_id)
403                && (ids[1] == Some(license_id) || ids[1] == licence_id)
404                && ids[2] == Some(identifier_id)
405        };
406
407        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
408            licenses_id.is_some()
409                && nuget_id.is_some()
410                && org_id.is_some()
411                && ids[0] == licenses_id
412                && ids[1] == Some(nuget_id.unwrap())
413                && ids[2] == Some(org_id.unwrap())
414        };
415
416        if tokens.len() >= 3 {
417            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
418            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
419                return Some(0);
420            }
421        }
422
423        if tokens.len() >= 4 {
424            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
425            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
426                return Some(1);
427            }
428        }
429
430        if tokens.len() >= 5 {
431            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
432            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
433                return Some(2);
434            }
435        }
436
437        None
438    }
439
    /// Build a query from already-extracted text, without a deadline.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether `text` was extracted from binary content
    ///
    /// # Errors
    /// Propagates any error from `from_extracted_text_with_deadline`.
    pub fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, anyhow::Error> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
447
    /// Build a query from already-extracted text, optionally bounded by a deadline.
    ///
    /// Binary-derived text uses `BINARY_LINE_THRESHOLD` for query-run
    /// splitting; other text uses `TEXT_LINE_THRESHOLD`. `binary_derived` is
    /// also taken as the query's `is_binary` flag, so no binary detection runs.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether `text` was extracted from binary content
    /// * `deadline` - Optional instant forwarded to the deadline checks
    ///
    /// # Errors
    /// Propagates any error from `with_source_options`.
    pub fn from_extracted_text_with_deadline(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        let line_threshold = if binary_derived {
            Self::BINARY_LINE_THRESHOLD
        } else {
            Self::TEXT_LINE_THRESHOLD
        };

        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
    }
462
    /// Build the query runs of this query.
    ///
    /// One `QueryRun` is created, on every call, for each `(start, end)` pair
    /// stored in `query_run_ranges`, in the same order.
    ///
    /// Corresponds to Python: `query.query_runs` property iteration
    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
        self.query_run_ranges
            .iter()
            .map(|&(start, end)| QueryRun::new(self, start, end))
            .collect()
    }
472
    /// Tokenize `text` against `index` and assemble the `Query`.
    ///
    /// The text is processed line by line: each token is looked up in the index
    /// dictionary, known tokens are appended to the token list together with
    /// their line number, and unknown tokens and stopwords are counted against
    /// the last known position. SPDX identifier lines are recorded on the way,
    /// and query runs are computed from the per-line tokens at the end.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `line_threshold` - Threshold forwarded to `compute_query_runs`
    /// * `binary_derived` - Known binary status; `None` runs `detect_binary`
    /// * `deadline` - Optional instant forwarded to the deadline checks
    ///
    /// # Errors
    /// Propagates errors from `ensure_within_deadline` (checked at the start,
    /// every 128 lines, and after tokenization) and from `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one is seen.
        let mut known_pos: Option<usize> = None;
        let mut started = false;
        let mut current_line = 1usize;

        // Per-line tokens (`None` marks an unknown token), used to compute query runs.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for (line_index, line) in text.lines().enumerate() {
            // Re-check the deadline periodically rather than on every line.
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknowns and stopwords seen before any known token are keyed by `None`.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Stopwords are counted but never added to the per-line tokens.
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                // The SPDX text is taken from the untrimmed line.
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset;

                // `line_last_known_pos` is `Some` here: this line has a known token.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    // The stored end is one past the line's last known position.
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }

            tokens_by_line.push(line_tokens);
            current_line += 1;
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
601
602    /// Detect if text is binary content.
603    ///
604    /// Binary detection checks for:
605    /// - Null bytes (0x00)
606    /// - High ratio of non-printable characters
607    ///
608    /// # Arguments
609    /// * `text` - The text to analyze
610    ///
611    /// # Returns
612    /// true if binary, false otherwise
613    ///
614    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
615    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
616        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
617
618        if null_byte_count > 0 {
619            return Ok(true);
620        }
621
622        let non_printable_ratio = text
623            .chars()
624            .filter(|&c| {
625                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
626            })
627            .count() as f64
628            / text.len().max(1) as f64;
629
630        Ok(non_printable_ratio > 0.3)
631    }
632
    /// Detect if text has very long lines (for minified JS/CSS).
    ///
    /// A line is "long" when `count_tokens` reports more than 25 tokens for it.
    ///
    /// # Arguments
    /// * `text` - The text to analyze
    ///
    /// # Returns
    /// true if there are lines with many tokens, false otherwise
    ///
    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
    fn detect_long_lines(text: &str) -> bool {
        // NOTE(review): 25 equals `MAX_TOKEN_PER_LINE`; presumably the same
        // limit — confirm before tying the two together.
        text.lines()
            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
    }
646
647    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
648        lines
649            .iter()
650            .flat_map(|line| {
651                if line.is_empty() {
652                    return Vec::new();
653                }
654
655                if line.len() <= Self::MAX_TOKEN_PER_LINE {
656                    vec![line.clone()]
657                } else {
658                    line.chunks(Self::MAX_TOKEN_PER_LINE)
659                        .map(|chunk| chunk.to_vec())
660                        .collect()
661                }
662            })
663            .collect()
664    }
665
    /// Compute query run ranges from per-line tokens.
    ///
    /// Lines are walked in order while counting consecutive "junk" lines:
    /// empty lines, lines with no known token, lines whose tokens are all
    /// unknown or digit-only, and lines without any legalese token. A line
    /// with a legalese token resets the count. Once a run is open and the
    /// count reaches `line_threshold`, the run is closed and a new one starts
    /// at the current position. Runs made only of digit-only tokens are
    /// discarded.
    ///
    /// # Arguments
    /// * `tokens_by_line` - Tokens of each line; `None` marks an unknown token
    /// * `line_threshold` - Number of junk lines that ends a run
    /// * `has_long_lines` - When true, lines are first split by `break_long_lines`
    ///
    /// # Returns
    /// `(start, end)` ranges over known-token positions, where `end` is the
    /// position of the last token of the run (inclusive).
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run has not received any known token yet.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Running position over known tokens only.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Close the open run once enough junk lines have accumulated.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // Until the run gets its first token, keep its start at the current position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) do not prevent a line from being all-digit.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Only known tokens advance `pos` and extend the run.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the last open run.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
742
    /// Get the line number for a token position.
    ///
    /// # Arguments
    /// * `pos` - The token position
    ///
    /// # Returns
    /// The line number (1-based), or `None` if `pos` is out of range
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
756
    /// Check if the query is empty (no known tokens).
    ///
    /// Unknown tokens and stopwords are not stored in `tokens`, so a query
    /// made only of those is empty as well.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
762
    /// Get a query run covering the entire query.
    ///
    /// The returned run is built from a snapshot of this query's current data
    /// rather than a reference to it.
    ///
    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
769
    /// Subtract matched span positions from matchables.
    ///
    /// This removes the span's positions from both `high_matchables` and
    /// `low_matchables`.
    ///
    /// # Arguments
    /// * `span` - The span of positions to subtract
    ///
    /// Corresponds to Python: `subtract()` method (lines 328-334)
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
782
    /// Extract matched text for a given line range.
    ///
    /// Returns the text from the original input between start_line and end_line
    /// (both inclusive, 1-indexed), with the selected lines joined by `\n`.
    ///
    /// # Arguments
    /// * `start_line` - Starting line number (1-indexed)
    /// * `end_line` - Ending line number (1-indexed)
    ///
    /// # Returns
    /// The matched text, or an empty string if either bound is 0, if
    /// `start_line > end_line`, or if the range lies past the end of the text
    ///
    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
799}
800
/// Owned copy of the query data a whole-query `QueryRun` reads, so that such
/// a run does not have to borrow the `Query` it was created from.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// License index of the source query.
    index: &'a LicenseIndex,
    /// Copy of the query's known token IDs.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line mapping.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchables at snapshot time.
    high_matchables: PositionSet,
    /// Copy of the query's low matchables at snapshot time.
    low_matchables: PositionSet,
}
809
/// A query run is a slice of query tokens identified by a start and end positions.
///
/// Query runs break a query into manageable chunks for efficient matching.
/// They track matchable token positions and support subtraction of matched spans.
///
/// A run reads its data either from a borrowed parent `Query` or from an
/// owned whole-query snapshot; exactly one of the two is expected to be set.
///
/// Based on Python QueryRun class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Parent query; `Some` for runs created with `QueryRun::new`.
    query: Option<&'a Query<'a>>,
    /// Owned query data; `Some` for runs created as a whole-query snapshot.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// Start position of the run (inclusive).
    pub start: usize,
    /// End position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    // NOTE(review): the three fields below look like lazily computed caches of
    // this run's matchable positions — confirm against the methods using them.
    cached_high_matchables: OnceCell<PositionSet>,
    cached_low_matchables: OnceCell<PositionSet>,
    combined_matchables: RefCell<Option<PositionSet>>,
}
827
828impl<'a> QueryRun<'a> {
    /// Create a new query run from a query with start and end positions.
    ///
    /// The run borrows `query`; no snapshot is taken and all caches start empty.
    ///
    /// # Arguments
    /// * `query` - The parent query
    /// * `start` - The start position (inclusive)
    /// * `end` - The end position (inclusive), or None for an empty run
    ///
    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
848
849    fn whole_query_snapshot(query: &Query<'a>) -> Self {
850        let end = if query.is_empty() {
851            None
852        } else {
853            Some(query.tokens.len() - 1)
854        };
855
856        Self {
857            query: None,
858            whole_query_snapshot: Some(WholeQueryRunSnapshot {
859                index: query.index,
860                tokens: query.tokens.clone(),
861                line_by_pos: query.line_by_pos.clone(),
862                high_matchables: query.high_matchables.clone(),
863                low_matchables: query.low_matchables.clone(),
864            }),
865            start: 0,
866            end,
867            cached_high_matchables: OnceCell::new(),
868            cached_low_matchables: OnceCell::new(),
869            combined_matchables: RefCell::new(None),
870        }
871    }
872
873    fn source_tokens(&self) -> &[TokenId] {
874        if let Some(query) = self.query {
875            &query.tokens
876        } else {
877            &self
878                .whole_query_snapshot
879                .as_ref()
880                .expect("snapshot-backed whole query run should have snapshot data")
881                .tokens
882        }
883    }
884
885    fn source_line_by_pos(&self) -> &[usize] {
886        if let Some(query) = self.query {
887            &query.line_by_pos
888        } else {
889            &self
890                .whole_query_snapshot
891                .as_ref()
892                .expect("snapshot-backed whole query run should have snapshot data")
893                .line_by_pos
894        }
895    }
896
897    fn source_high_matchables(&self) -> &PositionSet {
898        if let Some(query) = self.query {
899            &query.high_matchables
900        } else {
901            &self
902                .whole_query_snapshot
903                .as_ref()
904                .expect("snapshot-backed whole query run should have snapshot data")
905                .high_matchables
906        }
907    }
908
909    fn source_low_matchables(&self) -> &PositionSet {
910        if let Some(query) = self.query {
911            &query.low_matchables
912        } else {
913            &self
914                .whole_query_snapshot
915                .as_ref()
916                .expect("snapshot-backed whole query run should have snapshot data")
917                .low_matchables
918        }
919    }
920
921    /// Get the license index used by this query run.
922    pub fn get_index(&self) -> &LicenseIndex {
923        if let Some(query) = self.query {
924            query.index
925        } else {
926            self.whole_query_snapshot
927                .as_ref()
928                .expect("snapshot-backed whole query run should have snapshot data")
929                .index
930        }
931    }
932
933    /// Get the line number for a specific token position.
934    ///
935    /// # Arguments
936    /// * `pos` - Absolute token position in the query
937    ///
938    /// # Returns
939    /// The line number (1-based), or None if position is out of range
940    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
941        self.source_line_by_pos().get(pos).copied()
942    }
943
944    /// Get the sequence of token IDs for this run.
945    ///
946    /// Returns empty slice if end is None.
947    ///
948    /// Corresponds to Python: `tokens` property (lines 779-786)
949    pub fn tokens(&self) -> &[TokenId] {
950        match self.end {
951            Some(end) => &self.source_tokens()[self.start..=end],
952            None => &[],
953        }
954    }
955
956    /// Iterate over token IDs with their absolute positions.
957    ///
958    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
959    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
960        self.tokens()
961            .iter()
962            .copied()
963            .enumerate()
964            .map(|(i, tid)| (self.start + i, tid))
965    }
966
967    /// Check if this query run contains only digit tokens.
968    ///
969    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
970    pub fn is_digits_only(&self) -> bool {
971        self.tokens()
972            .iter()
973            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
974    }
975
976    /// Check if this query run has matchable tokens.
977    ///
978    /// # Arguments
979    /// * `include_low` - If true, include low-value tokens in the check
980    /// * `exclude_positions` - Optional set of spans containing positions to exclude
981    ///
982    /// Returns true if there are matchable tokens remaining
983    ///
984    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
985    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
986        if self.is_digits_only() {
987            return false;
988        }
989
990        let matchables = self.matchables(include_low);
991
992        if exclude_positions.is_empty() {
993            return !matchables.is_empty();
994        }
995
996        let mut matchable_set = matchables;
997        for span in exclude_positions {
998            matchable_set.remove_span(span);
999        }
1000
1001        !matchable_set.is_empty()
1002    }
1003
1004    pub fn matchables(&self, include_low: bool) -> PositionSet {
1005        if include_low {
1006            if let Some(ref cached) = *self.combined_matchables.borrow() {
1007                return cached.clone();
1008            }
1009            let combined = self.low_matchables().union(&self.high_matchables());
1010            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1011            combined
1012        } else {
1013            self.high_matchables()
1014        }
1015    }
1016
1017    pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1018        let high_matchables = self.high_matchables();
1019        if high_matchables.is_empty() {
1020            return Vec::new();
1021        }
1022
1023        let matchables = self.matchables(true);
1024        self.tokens_with_pos()
1025            .map(|(pos, tid)| {
1026                if matchables.contains(pos) {
1027                    Some(tid)
1028                } else {
1029                    None
1030                }
1031            })
1032            .collect()
1033    }
1034
1035    pub fn high_matchables(&self) -> PositionSet {
1036        self.cached_high_matchables
1037            .get_or_init(|| {
1038                let start = self.start;
1039                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1040                let source = self.source_high_matchables();
1041                let live_span = PositionSpan::new(start, end);
1042                source
1043                    .iter()
1044                    .filter(|&pos| live_span.contains(pos))
1045                    .collect()
1046            })
1047            .clone()
1048    }
1049
1050    pub fn low_matchables(&self) -> PositionSet {
1051        self.cached_low_matchables
1052            .get_or_init(|| {
1053                let start = self.start;
1054                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1055                let source = self.source_low_matchables();
1056                let live_span = PositionSpan::new(start, end);
1057                source
1058                    .iter()
1059                    .filter(|&pos| live_span.contains(pos))
1060                    .collect()
1061            })
1062            .clone()
1063    }
1064}
1065
1066#[cfg(test)]
1067mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs