provenant/license_detection/query/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Query processing - tokenized input for license matching.
5
6use crate::license_detection::index::LicenseIndex;
7use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
8use crate::license_detection::models::PositionSpan;
9use crate::license_detection::position_set::PositionSet;
10use crate::license_detection::spdx_lid::split_spdx_lid;
11use crate::license_detection::tokenize::STOPWORDS;
12use crate::license_detection::tokenize::tokenize_as_ids;
13use regex::Regex;
14use std::cell::{OnceCell, RefCell};
15use std::collections::HashMap;
16use std::sync::LazyLock;
17use std::time::Instant;
18
// Matches one query word: a run of word characters excluding `_`, optionally
// followed by a single `+` and further word characters.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
// Splits text into alternating captures: `token` uses the same word shape as
// `QUERY_PATTERN`; `punct` captures the underscores, non-word characters,
// whitespace and `+` signs found between words.
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
25
/// One piece of the original text produced while rebuilding matched text for
/// diagnostics: either a word or the punctuation/whitespace between words.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of this piece as it will be rendered.
    value: String,
    /// 1-based number of the line this piece was found on.
    line_num: usize,
    /// Known-token position assigned when the word is in the index dictionary;
    /// `None` for punctuation and for words without a dictionary entry.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace.
    is_text: bool,
    /// Set to `true` by `collect_reportable_tokens` when `pos` is one of the
    /// matched positions.
    is_matched: bool,
}
34
/// Tokenized query text prepared for license matching.
///
/// Query holds:
/// - Known token IDs (tokens existing in the index dictionary)
/// - Token positions and their corresponding line numbers (line_by_pos)
/// - Unknown tokens (tokens not in dictionary) tracked per position
/// - Stopwords tracked per position
/// - Positions with short/digit-only tokens
/// - High and low matchable token positions (for tracking what's been matched)
///
/// Based on Python Query class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
#[derive(Debug)]
pub struct Query<'a> {
    /// The original input text.
    ///
    /// Corresponds to Python: `self.query_string` (line 215)
    pub text: String,

    /// Token IDs for known tokens (tokens found in the index dictionary)
    ///
    /// Corresponds to Python: `self.tokens = []` (line 228)
    pub tokens: Vec<TokenId>,

    /// Mapping from token position to line number (1-based)
    ///
    /// Each token position in `self.tokens` maps to the line number where it appears.
    /// This is used for match position reporting.
    ///
    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
    pub line_by_pos: Vec<usize>,

    /// Mapping from token position to count of unknown tokens after that position
    ///
    /// Unknown tokens are those not found in the dictionary. We track them by
    /// counting how many unknown tokens appear after each known position.
    /// Unknown tokens before the first known token are tracked with the key `None`.
    ///
    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Mapping from token position to count of stopwords after that position
    ///
    /// Same keying scheme as `unknowns_by_pos`, but for stopwords: stopwords
    /// seen before the first known token are counted under the key `None`.
    ///
    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Set of positions with single-character or digit-only tokens
    ///
    /// These tokens have special handling in matching.
    ///
    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
    pub shorts_and_digits_pos: PositionSet,

    /// High-value matchable token positions (legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Legalese`.
    ///
    /// Corresponds to Python: `self.high_matchables` (line 293)
    pub high_matchables: PositionSet,

    /// Low-value matchable token positions (non-legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Regular`.
    ///
    /// Corresponds to Python: `self.low_matchables` (line 294)
    pub low_matchables: PositionSet,

    /// True if the query is detected as binary content
    ///
    /// Corresponds to Python: `self.is_binary = False` (line 225)
    pub is_binary: bool,

    /// Raw query run ranges (start, end) computed during tokenization.
    ///
    /// QueryRuns are created on-demand from these ranges.
    ///
    /// Corresponds to Python: `self.query_runs = []` (line 274)
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// SPDX-License-Identifier lines found during tokenization.
    ///
    /// Each tuple is (spdx_text, start_token_pos, end_token_pos).
    /// Used for creating LicenseMatches with correct token positions.
    ///
    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Reference to the license index for dictionary access and metadata
    pub index: &'a LicenseIndex,
}
126
/// Extract the lines `start_line..=end_line` (1-based, inclusive) from `text`.
///
/// The selected lines are joined with `\n`; original line terminators
/// (`\n` or `\r\n`) are not preserved. Lines past the end of `text` are
/// ignored.
///
/// # Arguments
/// * `text` - The text to slice by line
/// * `start_line` - First line to include (1-based)
/// * `end_line` - Last line to include (1-based)
///
/// # Returns
/// The selected lines, or an empty string when either bound is `0`, when
/// `start_line > end_line`, or when the range lies entirely past the text.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    if start_line == 0 || end_line == 0 || start_line > end_line {
        return String::new();
    }

    // `start_line >= 1` and `end_line >= start_line` here, so neither
    // subtraction underflows and the `+ 1` cannot overflow.
    let wanted = end_line - start_line + 1;
    let mut selected = String::new();

    // `skip`/`take` stop walking the text once the last requested line is
    // reached instead of scanning every remaining line.
    for (offset, line) in text.lines().skip(start_line - 1).take(wanted).enumerate() {
        if offset > 0 {
            selected.push('\n');
        }
        selected.push_str(line);
    }

    selected
}
145
146pub fn matched_text_diagnostics_from_text(
147    text: &str,
148    query: &Query<'_>,
149    matched_positions: &PositionSet,
150    start_pos: usize,
151    end_pos: usize,
152    start_line: usize,
153    end_line: usize,
154) -> String {
155    let tokens = tokenize_matched_text(text, query);
156    let reportable_tokens = collect_reportable_tokens(
157        tokens,
158        matched_positions,
159        start_pos,
160        end_pos,
161        start_line,
162        end_line,
163    );
164    let line_endings = collect_line_endings(text);
165
166    render_diagnostic_tokens(&reportable_tokens, &line_endings)
167}
168
169fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
170    let mut tokens = Vec::new();
171    let mut pos = 0usize;
172    for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
173        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
174            if let Some(token_match) = capture.name("token") {
175                let token_text = token_match.as_str();
176                let retokenized: Vec<String> = QUERY_PATTERN
177                    .find_iter(&token_text.to_lowercase())
178                    .map(|m| m.as_str().to_string())
179                    .filter(|token| !STOPWORDS.contains(token.as_str()))
180                    .collect();
181
182                if retokenized.is_empty() {
183                    tokens.push(MatchedTextToken {
184                        value: token_text.to_string(),
185                        line_num,
186                        pos: None,
187                        is_text: true,
188                        is_matched: false,
189                    });
190                } else if retokenized.len() == 1 {
191                    let token = &retokenized[0];
192                    let token_pos = if query.index.dictionary.get(token).is_some() {
193                        let current_pos = pos;
194                        pos += 1;
195                        Some(current_pos)
196                    } else {
197                        None
198                    };
199
200                    tokens.push(MatchedTextToken {
201                        value: token_text.to_string(),
202                        line_num,
203                        pos: token_pos,
204                        is_text: true,
205                        is_matched: false,
206                    });
207                } else {
208                    for token in retokenized {
209                        let token_pos = if query.index.dictionary.get(&token).is_some() {
210                            let current_pos = pos;
211                            pos += 1;
212                            Some(current_pos)
213                        } else {
214                            None
215                        };
216
217                        tokens.push(MatchedTextToken {
218                            value: token,
219                            line_num,
220                            pos: token_pos,
221                            is_text: true,
222                            is_matched: false,
223                        });
224                    }
225                }
226            } else if let Some(punct_match) = capture.name("punct") {
227                tokens.push(MatchedTextToken {
228                    value: punct_match.as_str().to_string(),
229                    line_num,
230                    pos: None,
231                    is_text: false,
232                    is_matched: false,
233                });
234            }
235        }
236    }
237
238    tokens
239}
240
/// Select the tokens that belong in the diagnostic rendering of a match.
///
/// Tokens on lines outside `start_line..=end_line` are ignored. Within that
/// range a token is kept when:
/// - its position is in `matched_positions` (it is also flagged `is_matched`), or
/// - it lies between the token at `start_pos` and the token at `end_pos`
///   (both inclusive), or
/// - it is the non-blank punctuation token directly following the token at
///   `end_pos`.
///
/// The statements in the loop body are order-dependent; see the inline notes.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`: the token at `start_pos` has been seen and `end_pos` not yet.
    let mut started = false;
    // `finished`: the token at `end_pos` has been seen.
    let mut finished = false;
    // Index (in `tokens`) of the token at `end_pos`, until it has been consumed
    // by the trailing-punctuation check below.
    let mut end_real_pos = None;
    // Index (in `tokens`) of the previously processed in-range token.
    let mut last_real_pos = None;

    // `real_pos` indexes the full token list, including tokens skipped below.
    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything between the start and end tokens is reported, matched or not.
        if started && !finished {
            is_included = true;
        }

        // Checked after the inclusion test above so the end token itself is kept.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token processed right after the end token: on the
        // end token itself `last_real_pos` still points at its predecessor.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            // Keep punctuation that directly follows the match, unless it is
            // whitespace only.
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
302
/// Return the terminator of every line in `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line that
/// has no terminator. A lone `\r` is not treated as a terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let terminator = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(terminator.to_string());
    }

    endings
}
316
317fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
318    let mut rendered = String::new();
319    let mut previous_line: Option<usize> = None;
320
321    for token in tokens {
322        if let Some(prev_line) = previous_line
323            && token.line_num > prev_line
324        {
325            for line in prev_line..token.line_num {
326                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
327                    rendered.push_str(line_ending.as_str());
328                }
329            }
330        }
331
332        let token_value = if token.is_text {
333            token.value.as_str()
334        } else {
335            token
336                .value
337                .strip_suffix("\r\n")
338                .or_else(|| token.value.strip_suffix('\n'))
339                .unwrap_or(token.value.as_str())
340        };
341
342        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
343            if token.is_matched {
344                rendered.push_str(token_value);
345            } else {
346                rendered.push('[');
347                rendered.push_str(token_value);
348                rendered.push(']');
349            }
350        } else {
351            rendered.push_str(token_value);
352        }
353
354        previous_line = Some(token.line_num);
355    }
356
357    rendered
358}
359
360impl<'a> Query<'a> {
    /// Line threshold handed to `with_source_options` for text that is not
    /// binary-derived.
    ///
    /// Detection scans file-like text, so this uses Python's
    /// `build_query(..., text_line_threshold=15)` threshold.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold handed to `with_source_options` for binary-derived text.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when `break_long_lines` splits a line.
    const MAX_TOKEN_PER_LINE: usize = 25;
378
379    fn compute_spdx_offset(
380        tokens: &[QueryToken],
381        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
382    ) -> Option<usize> {
383        let get_known_id = |i: usize| -> Option<TokenId> {
384            match tokens.get(i)? {
385                QueryToken::Known(known) => Some(known.id),
386                _ => None,
387            }
388        };
389
390        let spdx_id = dictionary.get("spdx")?;
391        let license_id = dictionary.get("license")?;
392        let identifier_id = dictionary.get("identifier")?;
393        let licence_id = dictionary.get("licence");
394
395        let licenses_id = dictionary.get("licenses");
396        let nuget_id = dictionary.get("nuget");
397        let org_id = dictionary.get("org");
398
399        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
400            ids.iter().all(|id| id.is_some())
401                && ids[0] == Some(spdx_id)
402                && (ids[1] == Some(license_id) || ids[1] == licence_id)
403                && ids[2] == Some(identifier_id)
404        };
405
406        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
407            licenses_id.is_some()
408                && nuget_id.is_some()
409                && org_id.is_some()
410                && ids[0] == licenses_id
411                && ids[1] == Some(nuget_id.unwrap())
412                && ids[2] == Some(org_id.unwrap())
413        };
414
415        if tokens.len() >= 3 {
416            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
417            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
418                return Some(0);
419            }
420        }
421
422        if tokens.len() >= 4 {
423            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
424            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
425                return Some(1);
426            }
427        }
428
429        if tokens.len() >= 5 {
430            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
431            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
432                return Some(2);
433            }
434        }
435
436        None
437    }
438
    /// Build a query from extracted text, without a deadline.
    ///
    /// Delegates to `from_extracted_text_with_deadline` with `deadline = None`.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether `text` was extracted from binary content
    ///
    /// # Returns
    /// The query, or the error reported by the underlying constructor
    pub fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, anyhow::Error> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
446
447    pub fn from_extracted_text_with_deadline(
448        text: &str,
449        index: &'a LicenseIndex,
450        binary_derived: bool,
451        deadline: Option<Instant>,
452    ) -> Result<Self, anyhow::Error> {
453        let line_threshold = if binary_derived {
454            Self::BINARY_LINE_THRESHOLD
455        } else {
456            Self::TEXT_LINE_THRESHOLD
457        };
458
459        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
460    }
461
462    /// Iterate over query runs.
463    ///
464    /// Corresponds to Python: `query.query_runs` property iteration
465    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
466        self.query_run_ranges
467            .iter()
468            .map(|&(start, end)| QueryRun::new(self, start, end))
469            .collect()
470    }
471
    /// Tokenize `text` line by line and assemble a `Query`.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `line_threshold` - Passed to `compute_query_runs` as the number of
    ///   "empty" lines after which a query run is broken
    /// * `binary_derived` - `Some(flag)` to use the caller's binary flag,
    ///   `None` to run `detect_binary` on `text`
    /// * `deadline` - Optional deadline, checked before tokenizing, every 128
    ///   lines during tokenization, and once more afterwards
    ///
    /// # Errors
    /// Propagates errors from `ensure_within_deadline` and `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one.
        let mut known_pos: Option<usize> = None;
        // Becomes true once any known token has been seen.
        let mut started = false;
        // Per line: `Some(token)` for known tokens, `None` for unknown ones.
        // Stopwords are not recorded here.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        // `current_line` is the 1-based line number, `line_index` the 0-based one.
        for (current_line, (line_index, line)) in (1usize..).zip(text.lines().enumerate()) {
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        // First known token gets position 0, later ones increment.
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            // `known_pos` was set to `Some` just above.
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknown tokens before the first known token go under `None`.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Later unknown tokens are counted against the last known position.
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Stopwords are counted the same way but add nothing to `line_tokens`.
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // Record the line when it carries an SPDX-style prefix and at least
            // one known token.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset;

                // `line_last_known_pos` is `Some` here: this line produced a known token.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    // The stored end is one past the line's last known position.
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }
            tokens_by_line.push(line_tokens);
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        // Positions whose token kind is `Legalese`.
        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        // Positions whose token kind is `Regular`.
        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
596
597    /// Detect if text is binary content.
598    ///
599    /// Binary detection checks for:
600    /// - Null bytes (0x00)
601    /// - High ratio of non-printable characters
602    ///
603    /// # Arguments
604    /// * `text` - The text to analyze
605    ///
606    /// # Returns
607    /// true if binary, false otherwise
608    ///
609    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
610    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
611        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
612
613        if null_byte_count > 0 {
614            return Ok(true);
615        }
616
617        let non_printable_ratio = text
618            .chars()
619            .filter(|&c| {
620                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
621            })
622            .count() as f64
623            / text.len().max(1) as f64;
624
625        Ok(non_printable_ratio > 0.3)
626    }
627
628    /// Detect if text has very long lines (for minified JS/CSS).
629    ///
630    /// # Arguments
631    /// * `text` - The text to analyze
632    ///
633    /// # Returns
634    /// true if there are lines with many tokens, false otherwise
635    ///
636    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
637    fn detect_long_lines(text: &str) -> bool {
638        text.lines()
639            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
640    }
641
642    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
643        lines
644            .iter()
645            .flat_map(|line| {
646                if line.is_empty() {
647                    return Vec::new();
648                }
649
650                if line.len() <= Self::MAX_TOKEN_PER_LINE {
651                    vec![line.clone()]
652                } else {
653                    line.chunks(Self::MAX_TOKEN_PER_LINE)
654                        .map(|chunk| chunk.to_vec())
655                        .collect()
656                }
657            })
658            .collect()
659    }
660
    /// Partition the known-token positions into query runs.
    ///
    /// Walks the per-line tokens while counting consecutive "empty" lines: lines
    /// with no tokens, lines whose tokens are all digit-only or unknown, and
    /// lines without any legalese token. Once that count reaches
    /// `line_threshold` and the current run holds at least one token, the run
    /// is closed and a new one starts at the next position.
    ///
    /// # Arguments
    /// * `tokens_by_line` - Per line, `Some(token)` for known tokens and `None`
    ///   for unknown ones
    /// * `line_threshold` - Number of empty lines that breaks a run
    /// * `has_long_lines` - When true, lines are first split by `break_long_lines`
    ///
    /// # Returns
    /// `(start, end)` position pairs, `end` inclusive. Runs made up solely of
    /// digit-only tokens are left out.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run holds no token yet.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Position the next known token will get.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Break point: the current run is non-empty and enough empty lines
            // have accumulated.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // A run without tokens keeps sliding its start to the current position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) count as digit-only for this check.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // `flatten` skips the `None` placeholders, so only known tokens
            // advance `pos`.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with a legalese token resets the empty-line counter.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the final run if it holds tokens and is not digit-only.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
737
    /// Get the line number for a token position.
    ///
    /// # Arguments
    /// * `pos` - The token position
    ///
    /// # Returns
    /// The line number (1-based), or `None` if `pos` is out of range
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
751
    /// Check if the query is empty (no known tokens).
    ///
    /// Only `tokens` is consulted; unknown tokens and stopwords are not part
    /// of that list.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
757
    /// Get a query run covering the entire query.
    ///
    /// The returned run is snapshot-backed: `QueryRun::whole_query_snapshot`
    /// clones the tokens, line numbers and matchable sets, so later changes to
    /// this query (for example via `subtract`) are not reflected in it.
    ///
    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
764
    /// Subtract matched span positions from matchables.
    ///
    /// This removes the positions from both high and low matchables; no other
    /// field of the query is modified.
    ///
    /// # Arguments
    /// * `span` - The span of positions to subtract
    ///
    /// Corresponds to Python: `subtract()` method (lines 328-334)
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
777
    /// Extract matched text for a given line range.
    ///
    /// Returns the text from the original input between start_line and end_line
    /// (both inclusive, 1-indexed). Thin wrapper around `matched_text_from_text`
    /// applied to `self.text`.
    ///
    /// # Arguments
    /// * `start_line` - Starting line number (1-indexed)
    /// * `end_line` - Ending line number (1-indexed)
    ///
    /// # Returns
    /// The matched text, or empty string if lines are out of range
    ///
    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
794}
795
/// Owned copy of the query data a whole-query `QueryRun` reads.
///
/// Filled by `QueryRun::whole_query_snapshot`, which clones these fields from
/// a `Query` so the resulting run does not hold a reference to that query.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// License index of the originating query.
    index: &'a LicenseIndex,
    /// Copy of the query's known token IDs.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line mapping.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchable positions at snapshot time.
    high_matchables: PositionSet,
    /// Copy of the query's low matchable positions at snapshot time.
    low_matchables: PositionSet,
}
804
/// A query run is a slice of query tokens identified by a start and end positions.
///
/// Query runs break a query into manageable chunks for efficient matching.
/// They track matchable token positions and support subtraction of matched spans.
///
/// A run reads its data either from a borrowed parent `Query` (`query` is
/// `Some`) or from an owned snapshot (`whole_query_snapshot` is `Some`); the
/// constructors in this file set exactly one of the two.
///
/// Based on Python QueryRun class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Parent query, set by `QueryRun::new`; `None` for snapshot-backed runs.
    query: Option<&'a Query<'a>>,
    /// Owned query data, set by `QueryRun::whole_query_snapshot`; `None` otherwise.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// Start position of the run (inclusive).
    pub start: usize,
    /// End position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    // The three caches below start out empty in both constructors.
    // NOTE(review): presumably filled lazily by accessors outside this view — confirm.
    cached_high_matchables: OnceCell<PositionSet>,
    cached_low_matchables: OnceCell<PositionSet>,
    combined_matchables: RefCell<Option<PositionSet>>,
}
822
823impl<'a> QueryRun<'a> {
    /// Create a new query run from a query with start and end positions.
    ///
    /// The run borrows `query` and carries no snapshot; all of its caches
    /// start out empty.
    ///
    /// # Arguments
    /// * `query` - The parent query
    /// * `start` - The start position (inclusive)
    /// * `end` - The end position (inclusive), or None for an empty run
    ///
    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
843
844    fn whole_query_snapshot(query: &Query<'a>) -> Self {
845        let end = if query.is_empty() {
846            None
847        } else {
848            Some(query.tokens.len() - 1)
849        };
850
851        Self {
852            query: None,
853            whole_query_snapshot: Some(WholeQueryRunSnapshot {
854                index: query.index,
855                tokens: query.tokens.clone(),
856                line_by_pos: query.line_by_pos.clone(),
857                high_matchables: query.high_matchables.clone(),
858                low_matchables: query.low_matchables.clone(),
859            }),
860            start: 0,
861            end,
862            cached_high_matchables: OnceCell::new(),
863            cached_low_matchables: OnceCell::new(),
864            combined_matchables: RefCell::new(None),
865        }
866    }
867
868    fn source_tokens(&self) -> &[TokenId] {
869        if let Some(query) = self.query {
870            &query.tokens
871        } else {
872            &self
873                .whole_query_snapshot
874                .as_ref()
875                .expect("snapshot-backed whole query run should have snapshot data")
876                .tokens
877        }
878    }
879
880    fn source_line_by_pos(&self) -> &[usize] {
881        if let Some(query) = self.query {
882            &query.line_by_pos
883        } else {
884            &self
885                .whole_query_snapshot
886                .as_ref()
887                .expect("snapshot-backed whole query run should have snapshot data")
888                .line_by_pos
889        }
890    }
891
892    fn source_high_matchables(&self) -> &PositionSet {
893        if let Some(query) = self.query {
894            &query.high_matchables
895        } else {
896            &self
897                .whole_query_snapshot
898                .as_ref()
899                .expect("snapshot-backed whole query run should have snapshot data")
900                .high_matchables
901        }
902    }
903
904    fn source_low_matchables(&self) -> &PositionSet {
905        if let Some(query) = self.query {
906            &query.low_matchables
907        } else {
908            &self
909                .whole_query_snapshot
910                .as_ref()
911                .expect("snapshot-backed whole query run should have snapshot data")
912                .low_matchables
913        }
914    }
915
916    /// Get the license index used by this query run.
917    pub fn get_index(&self) -> &LicenseIndex {
918        if let Some(query) = self.query {
919            query.index
920        } else {
921            self.whole_query_snapshot
922                .as_ref()
923                .expect("snapshot-backed whole query run should have snapshot data")
924                .index
925        }
926    }
927
928    /// Get the line number for a specific token position.
929    ///
930    /// # Arguments
931    /// * `pos` - Absolute token position in the query
932    ///
933    /// # Returns
934    /// The line number (1-based), or None if position is out of range
935    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
936        self.source_line_by_pos().get(pos).copied()
937    }
938
939    /// Get the sequence of token IDs for this run.
940    ///
941    /// Returns empty slice if end is None.
942    ///
943    /// Corresponds to Python: `tokens` property (lines 779-786)
944    pub fn tokens(&self) -> &[TokenId] {
945        match self.end {
946            Some(end) => &self.source_tokens()[self.start..=end],
947            None => &[],
948        }
949    }
950
951    /// Iterate over token IDs with their absolute positions.
952    ///
953    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
954    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
955        self.tokens()
956            .iter()
957            .copied()
958            .enumerate()
959            .map(|(i, tid)| (self.start + i, tid))
960    }
961
962    /// Check if this query run contains only digit tokens.
963    ///
964    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
965    pub fn is_digits_only(&self) -> bool {
966        self.tokens()
967            .iter()
968            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
969    }
970
971    /// Check if this query run has matchable tokens.
972    ///
973    /// # Arguments
974    /// * `include_low` - If true, include low-value tokens in the check
975    /// * `exclude_positions` - Optional set of spans containing positions to exclude
976    ///
977    /// Returns true if there are matchable tokens remaining
978    ///
979    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
980    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
981        if self.is_digits_only() {
982            return false;
983        }
984
985        let matchables = self.matchables(include_low);
986
987        if exclude_positions.is_empty() {
988            return !matchables.is_empty();
989        }
990
991        let mut matchable_set = matchables;
992        for span in exclude_positions {
993            matchable_set.remove_span(span);
994        }
995
996        !matchable_set.is_empty()
997    }
998
999    pub fn matchables(&self, include_low: bool) -> PositionSet {
1000        if include_low {
1001            if let Some(ref cached) = *self.combined_matchables.borrow() {
1002                return cached.clone();
1003            }
1004            let combined = self.low_matchables().union(&self.high_matchables());
1005            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1006            combined
1007        } else {
1008            self.high_matchables()
1009        }
1010    }
1011
1012    pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1013        let high_matchables = self.high_matchables();
1014        if high_matchables.is_empty() {
1015            return Vec::new();
1016        }
1017
1018        let matchables = self.matchables(true);
1019        self.tokens_with_pos()
1020            .map(|(pos, tid)| {
1021                if matchables.contains(pos) {
1022                    Some(tid)
1023                } else {
1024                    None
1025                }
1026            })
1027            .collect()
1028    }
1029
1030    pub fn high_matchables(&self) -> PositionSet {
1031        self.cached_high_matchables
1032            .get_or_init(|| {
1033                let start = self.start;
1034                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1035                let source = self.source_high_matchables();
1036                let live_span = PositionSpan::new(start, end);
1037                source
1038                    .iter()
1039                    .filter(|&pos| live_span.contains(pos))
1040                    .collect()
1041            })
1042            .clone()
1043    }
1044
1045    pub fn low_matchables(&self) -> PositionSet {
1046        self.cached_low_matchables
1047            .get_or_init(|| {
1048                let start = self.start;
1049                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1050                let source = self.source_low_matchables();
1051                let live_span = PositionSpan::new(start, end);
1052                source
1053                    .iter()
1054                    .filter(|&pos| live_span.contains(pos))
1055                    .collect()
1056            })
1057            .clone()
1058    }
1059}
1060
1061#[cfg(test)]
1062mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs