provenant/license_detection/query/
mod.rs

1//! Query processing - tokenized input for license matching.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::models::PositionSpan;
6use crate::license_detection::position_set::PositionSet;
7use crate::license_detection::spdx_lid::split_spdx_lid;
8use crate::license_detection::tokenize::STOPWORDS;
9use crate::license_detection::tokenize::tokenize_as_ids;
10use once_cell::sync::Lazy;
11use regex::Regex;
12use std::cell::{OnceCell, RefCell};
13use std::collections::HashMap;
14
15static QUERY_PATTERN: Lazy<Regex> =
16    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
17static MATCHED_TEXT_PATTERN: Lazy<Regex> = Lazy::new(|| {
18    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
19        .expect("valid matched text regex")
20});
21
/// One lexical piece of the original text, as produced by
/// `tokenize_matched_text` for matched-text diagnostics.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of this piece: a word, or a run of punctuation/whitespace.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Known-token position; `Some` only for words found in the index dictionary.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// Set once the token's position is found among the matched positions.
    is_matched: bool,
}
30
31///
32/// Query holds:
33/// - Known token IDs (tokens existing in the index dictionary)
34/// - Token positions and their corresponding line numbers (line_by_pos)
35/// - Unknown tokens (tokens not in dictionary) tracked per position
36/// - Stopwords tracked per position
37/// - Positions with short/digit-only tokens
38/// - High and low matchable token positions (for tracking what's been matched)
39///
40/// Based on Python Query class at:
41/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
42#[derive(Debug)]
43pub struct Query<'a> {
44    /// The original input text.
45    ///
46    /// Corresponds to Python: `self.query_string` (line 215)
47    pub text: String,
48
49    /// Token IDs for known tokens (tokens found in the index dictionary)
50    ///
51    /// Corresponds to Python: `self.tokens = []` (line 228)
52    pub tokens: Vec<TokenId>,
53
54    /// Mapping from token position to line number (1-based)
55    ///
56    /// Each token position in `self.tokens` maps to the line number where it appears.
57    /// This is used for match position reporting.
58    ///
59    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
60    pub line_by_pos: Vec<usize>,
61
62    /// Mapping from token position to count of unknown tokens after that position
63    ///
64    /// Unknown tokens are those not found in the dictionary. We track them by
65    /// counting how many unknown tokens appear after each known position.
66    /// Unknown tokens before the first known token are tracked with the key `None`.
67    ///
68    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
69    pub unknowns_by_pos: HashMap<Option<usize>, usize>,
70
71    /// Mapping from token position to count of stopwords after that position
72    ///
73    /// Similar to unknown_tokens, but for stopwords.
74    ///
75    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
76    pub stopwords_by_pos: HashMap<Option<usize>, usize>,
77
78    /// Set of positions with single-character or digit-only tokens
79    ///
80    /// These tokens have special handling in matching.
81    ///
82    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
83    pub shorts_and_digits_pos: PositionSet,
84
85    /// High-value matchable token positions (legalese tokens)
86    ///
87    /// These are tokens with ID < len_legalese.
88    ///
89    /// Corresponds to Python: `self.high_matchables` (line 293)
90    pub high_matchables: PositionSet,
91
92    /// Low-value matchable token positions (non-legalese tokens)
93    ///
94    /// These are tokens with ID >= len_legalese.
95    ///
96    /// Corresponds to Python: `self.low_matchables` (line 294)
97    pub low_matchables: PositionSet,
98
99    /// True if the query is detected as binary content
100    ///
101    /// Corresponds to Python: `self.is_binary = False` (line 225)
102    pub is_binary: bool,
103
104    /// Raw query run ranges (start, end) computed during tokenization.
105    ///
106    /// QueryRuns are created on-demand from these ranges.
107    ///
108    /// Corresponds to Python: `self.query_runs = []` (line 274)
109    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,
110
111    /// SPDX-License-Identifier lines found during tokenization.
112    ///
113    /// Each tuple is (spdx_text, start_token_pos, end_token_pos).
114    /// Used for creating LicenseMatches with correct token positions.
115    ///
116    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
117    pub spdx_lines: Vec<(String, usize, usize)>,
118
119    /// Reference to the license index for dictionary access and metadata
120    pub index: &'a LicenseIndex,
121}
122
/// Return lines `start_line..=end_line` of `text` (1-based, inclusive), joined
/// with `\n`.
///
/// An empty string is returned when either bound is 0 or when
/// `start_line > end_line`. Lines past the end of `text` are simply absent
/// from the result.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let is_valid_range = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !is_valid_range {
        return String::new();
    }

    let wanted = end_line - start_line + 1;
    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();
    selected.join("\n")
}
141
142pub fn matched_text_diagnostics_from_text(
143    text: &str,
144    query: &Query<'_>,
145    matched_positions: &PositionSet,
146    start_pos: usize,
147    end_pos: usize,
148    start_line: usize,
149    end_line: usize,
150) -> String {
151    let tokens = tokenize_matched_text(text, query);
152    let reportable_tokens = collect_reportable_tokens(
153        tokens,
154        matched_positions,
155        start_pos,
156        end_pos,
157        start_line,
158        end_line,
159    );
160    let line_endings = collect_line_endings(text);
161
162    render_diagnostic_tokens(&reportable_tokens, &line_endings)
163}
164
165fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
166    let mut tokens = Vec::new();
167    let mut pos = 0usize;
168    let mut line_num = 1usize;
169
170    for line in text.split_inclusive('\n') {
171        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
172            if let Some(token_match) = capture.name("token") {
173                let token_text = token_match.as_str();
174                let retokenized: Vec<String> = QUERY_PATTERN
175                    .find_iter(&token_text.to_lowercase())
176                    .map(|m| m.as_str().to_string())
177                    .filter(|token| !STOPWORDS.contains(token.as_str()))
178                    .collect();
179
180                if retokenized.is_empty() {
181                    tokens.push(MatchedTextToken {
182                        value: token_text.to_string(),
183                        line_num,
184                        pos: None,
185                        is_text: true,
186                        is_matched: false,
187                    });
188                } else if retokenized.len() == 1 {
189                    let token = &retokenized[0];
190                    let token_pos = if query.index.dictionary.get(token).is_some() {
191                        let current_pos = pos;
192                        pos += 1;
193                        Some(current_pos)
194                    } else {
195                        None
196                    };
197
198                    tokens.push(MatchedTextToken {
199                        value: token_text.to_string(),
200                        line_num,
201                        pos: token_pos,
202                        is_text: true,
203                        is_matched: false,
204                    });
205                } else {
206                    for token in retokenized {
207                        let token_pos = if query.index.dictionary.get(&token).is_some() {
208                            let current_pos = pos;
209                            pos += 1;
210                            Some(current_pos)
211                        } else {
212                            None
213                        };
214
215                        tokens.push(MatchedTextToken {
216                            value: token,
217                            line_num,
218                            pos: token_pos,
219                            is_text: true,
220                            is_matched: false,
221                        });
222                    }
223                }
224            } else if let Some(punct_match) = capture.name("punct") {
225                tokens.push(MatchedTextToken {
226                    value: punct_match.as_str().to_string(),
227                    line_num,
228                    pos: None,
229                    is_text: false,
230                    is_matched: false,
231                });
232            }
233        }
234
235        line_num += 1;
236    }
237
238    tokens
239}
240
241fn collect_reportable_tokens(
242    tokens: Vec<MatchedTextToken>,
243    matched_positions: &PositionSet,
244    start_pos: usize,
245    end_pos: usize,
246    start_line: usize,
247    end_line: usize,
248) -> Vec<MatchedTextToken> {
249    let mut reportable = Vec::new();
250    let mut started = false;
251    let mut finished = false;
252    let mut end_real_pos = None;
253    let mut last_real_pos = None;
254
255    for (real_pos, mut token) in tokens.into_iter().enumerate() {
256        if token.line_num < start_line {
257            continue;
258        }
259
260        if token.line_num > end_line {
261            break;
262        }
263
264        let mut is_included = false;
265
266        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
267            token.is_matched = true;
268            is_included = true;
269        }
270
271        if !started && token.pos == Some(start_pos) {
272            started = true;
273            is_included = true;
274        }
275
276        if started && !finished {
277            is_included = true;
278        }
279
280        if token.pos == Some(end_pos) {
281            finished = true;
282            started = false;
283            end_real_pos = Some(real_pos);
284        }
285
286        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
287            end_real_pos = None;
288            if !token.is_text && !token.value.trim().is_empty() {
289                is_included = true;
290            }
291        }
292
293        last_real_pos = Some(real_pos);
294
295        if is_included {
296            reportable.push(token);
297        }
298    }
299
300    reportable
301}
302
/// Return the terminator of every line of `text`, in order: `"\r\n"`, `"\n"`,
/// or an empty string for a final line without a terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
316
317fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
318    let mut rendered = String::new();
319    let mut previous_line: Option<usize> = None;
320
321    for token in tokens {
322        if let Some(prev_line) = previous_line
323            && token.line_num > prev_line
324        {
325            for line in prev_line..token.line_num {
326                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
327                    rendered.push_str(line_ending.as_str());
328                }
329            }
330        }
331
332        let token_value = if token.is_text {
333            token.value.as_str()
334        } else {
335            token
336                .value
337                .strip_suffix("\r\n")
338                .or_else(|| token.value.strip_suffix('\n'))
339                .unwrap_or(token.value.as_str())
340        };
341
342        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
343            if token.is_matched {
344                rendered.push_str(token_value);
345            } else {
346                rendered.push('[');
347                rendered.push_str(token_value);
348                rendered.push(']');
349            }
350        } else {
351            rendered.push_str(token_value);
352        }
353
354        previous_line = Some(token.line_num);
355    }
356
357    rendered
358}
359
360impl<'a> Query<'a> {
361    /// Create a new query from text string and license index.
362    ///
363    /// This tokenizes the input text, looks up each token in the index dictionary,
364    /// and builds the query structures for matching.
365    ///
366    /// # Arguments
367    /// * `text` - The input text to tokenize
368    /// * `index` - The license index containing the token dictionary
369    ///
370    /// # Returns
371    /// A Result containing the Query or an error if binary detection fails
372    ///
373    /// Detection scans file-like text, so this uses Python's
374    /// `build_query(..., text_line_threshold=15)` threshold.
375    const TEXT_LINE_THRESHOLD: usize = 15;
376    const BINARY_LINE_THRESHOLD: usize = 50;
377    const MAX_TOKEN_PER_LINE: usize = 25;
378
379    fn compute_spdx_offset(
380        tokens: &[QueryToken],
381        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
382    ) -> Option<usize> {
383        let get_known_id = |i: usize| -> Option<TokenId> {
384            match tokens.get(i)? {
385                QueryToken::Known(known) => Some(known.id),
386                _ => None,
387            }
388        };
389
390        let spdx_id = dictionary.get("spdx")?;
391        let license_id = dictionary.get("license")?;
392        let identifier_id = dictionary.get("identifier")?;
393        let licence_id = dictionary.get("licence");
394
395        let licenses_id = dictionary.get("licenses");
396        let nuget_id = dictionary.get("nuget");
397        let org_id = dictionary.get("org");
398
399        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
400            ids.iter().all(|id| id.is_some())
401                && ids[0] == Some(spdx_id)
402                && (ids[1] == Some(license_id) || ids[1] == licence_id)
403                && ids[2] == Some(identifier_id)
404        };
405
406        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
407            licenses_id.is_some()
408                && nuget_id.is_some()
409                && org_id.is_some()
410                && ids[0] == licenses_id
411                && ids[1] == Some(nuget_id.unwrap())
412                && ids[2] == Some(org_id.unwrap())
413        };
414
415        if tokens.len() >= 3 {
416            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
417            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
418                return Some(0);
419            }
420        }
421
422        if tokens.len() >= 4 {
423            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
424            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
425                return Some(1);
426            }
427        }
428
429        if tokens.len() >= 5 {
430            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
431            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
432                return Some(2);
433            }
434        }
435
436        None
437    }
438
439    pub fn from_extracted_text(
440        text: &str,
441        index: &'a LicenseIndex,
442        binary_derived: bool,
443    ) -> Result<Self, anyhow::Error> {
444        let line_threshold = if binary_derived {
445            Self::BINARY_LINE_THRESHOLD
446        } else {
447            Self::TEXT_LINE_THRESHOLD
448        };
449
450        Self::with_source_options(text, index, line_threshold, Some(binary_derived))
451    }
452
453    /// Iterate over query runs.
454    ///
455    /// Corresponds to Python: `query.query_runs` property iteration
456    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
457        self.query_run_ranges
458            .iter()
459            .map(|&(start, end)| QueryRun::new(self, start, end))
460            .collect()
461    }
462
463    fn with_source_options(
464        text: &str,
465        index: &'a LicenseIndex,
466        line_threshold: usize,
467        binary_derived: Option<bool>,
468    ) -> Result<Self, anyhow::Error> {
469        let is_binary = match binary_derived {
470            Some(is_binary) => is_binary,
471            None => Self::detect_binary(text)?,
472        };
473        let has_long_lines = Self::detect_long_lines(text);
474
475        let mut tokens = Vec::new();
476        let mut line_by_pos = Vec::new();
477        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
478        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
479        let mut shorts_and_digits_pos = PositionSet::new();
480        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();
481
482        let mut known_pos: Option<usize> = None;
483        let mut started = false;
484        let mut current_line = 1usize;
485
486        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();
487
488        for line in text.lines() {
489            let line_trimmed = line.trim();
490            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();
491
492            let mut line_first_known_pos = None;
493
494            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);
495
496            for query_token in &line_query_tokens {
497                match query_token {
498                    QueryToken::Known(known_token) => {
499                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
500                        started = true;
501                        tokens.push(known_token.id);
502                        line_by_pos.push(current_line);
503                        line_tokens.push(Some(*known_token));
504
505                        if line_first_known_pos.is_none() {
506                            line_first_known_pos = known_pos;
507                        }
508
509                        if known_token.is_short_or_digit {
510                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
511                        }
512                    }
513                    QueryToken::Unknown if !started => {
514                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
515                        line_tokens.push(None);
516                    }
517                    QueryToken::Unknown => {
518                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
519                        line_tokens.push(None);
520                    }
521                    QueryToken::Stopword if !started => {
522                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
523                    }
524                    QueryToken::Stopword => {
525                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
526                    }
527                }
528            }
529
530            let line_last_known_pos = known_pos;
531
532            let spdx_start_offset =
533                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);
534
535            if let Some(offset) = spdx_start_offset
536                && let Some(line_first_known_pos) = line_first_known_pos
537            {
538                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
539                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
540                let spdx_start_known_pos = line_first_known_pos + offset;
541
542                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
543                    let spdx_end = line_last_known_pos.unwrap() + 1;
544                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
545                }
546            }
547
548            tokens_by_line.push(line_tokens);
549            current_line += 1;
550        }
551
552        let high_matchables: PositionSet = tokens
553            .iter()
554            .enumerate()
555            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
556            .map(|(pos, _tid)| pos)
557            .collect();
558
559        let low_matchables: PositionSet = tokens
560            .iter()
561            .enumerate()
562            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
563            .map(|(pos, _tid)| pos)
564            .collect();
565
566        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);
567
568        Ok(Query {
569            text: text.to_string(),
570            tokens,
571            line_by_pos,
572            unknowns_by_pos,
573            stopwords_by_pos,
574            shorts_and_digits_pos,
575            high_matchables,
576            low_matchables,
577            is_binary,
578            query_run_ranges: query_runs,
579            spdx_lines,
580            index,
581        })
582    }
583
584    /// Detect if text is binary content.
585    ///
586    /// Binary detection checks for:
587    /// - Null bytes (0x00)
588    /// - High ratio of non-printable characters
589    ///
590    /// # Arguments
591    /// * `text` - The text to analyze
592    ///
593    /// # Returns
594    /// true if binary, false otherwise
595    ///
596    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
597    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
598        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
599
600        if null_byte_count > 0 {
601            return Ok(true);
602        }
603
604        let non_printable_ratio = text
605            .chars()
606            .filter(|&c| {
607                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
608            })
609            .count() as f64
610            / text.len().max(1) as f64;
611
612        Ok(non_printable_ratio > 0.3)
613    }
614
615    /// Detect if text has very long lines (for minified JS/CSS).
616    ///
617    /// # Arguments
618    /// * `text` - The text to analyze
619    ///
620    /// # Returns
621    /// true if there are lines with many tokens, false otherwise
622    ///
623    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
624    fn detect_long_lines(text: &str) -> bool {
625        text.lines()
626            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
627    }
628
629    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
630        lines
631            .iter()
632            .flat_map(|line| {
633                if line.is_empty() {
634                    return Vec::new();
635                }
636
637                if line.len() <= Self::MAX_TOKEN_PER_LINE {
638                    vec![line.clone()]
639                } else {
640                    line.chunks(Self::MAX_TOKEN_PER_LINE)
641                        .map(|chunk| chunk.to_vec())
642                        .collect()
643                }
644            })
645            .collect()
646    }
647
648    fn compute_query_runs(
649        tokens_by_line: &[Vec<Option<KnownToken>>],
650        line_threshold: usize,
651        has_long_lines: bool,
652    ) -> Vec<(usize, Option<usize>)> {
653        let processed_lines = if has_long_lines {
654            Self::break_long_lines(tokens_by_line)
655        } else {
656            tokens_by_line.to_vec()
657        };
658
659        let mut query_runs = Vec::new();
660        let mut query_run_start = 0usize;
661        let mut query_run_end = None;
662        let mut empty_lines = 0usize;
663        let mut pos = 0usize;
664        let mut query_run_is_all_digit = true;
665
666        for line_tokens in processed_lines {
667            if query_run_end.is_some() && empty_lines >= line_threshold {
668                if !query_run_is_all_digit {
669                    query_runs.push((query_run_start, query_run_end));
670                }
671                query_run_start = pos;
672                query_run_end = None;
673                empty_lines = 0;
674                query_run_is_all_digit = true;
675            }
676
677            if query_run_end.is_none() {
678                query_run_start = pos;
679            }
680
681            if line_tokens.is_empty() {
682                empty_lines += 1;
683                continue;
684            }
685
686            let line_is_all_digit = line_tokens
687                .iter()
688                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
689            let mut line_has_known_tokens = false;
690            let mut line_has_good_tokens = false;
691
692            for known in line_tokens.into_iter().flatten() {
693                line_has_known_tokens = true;
694                if known.kind == TokenKind::Legalese {
695                    line_has_good_tokens = true;
696                }
697                if !known.is_digit_only {
698                    query_run_is_all_digit = false;
699                }
700                query_run_end = Some(pos);
701                pos += 1;
702            }
703
704            if line_is_all_digit || !line_has_known_tokens {
705                empty_lines += 1;
706                continue;
707            }
708
709            if line_has_good_tokens {
710                empty_lines = 0;
711            } else {
712                empty_lines += 1;
713            }
714        }
715
716        if let Some(end) = query_run_end
717            && !query_run_is_all_digit
718        {
719            query_runs.push((query_run_start, Some(end)));
720        }
721
722        query_runs
723    }
724
725    /// Get the length of the query in tokens.
726    ///
727    /// Get the line number for a token position.
728    ///
729    /// # Arguments
730    /// * `pos` - The token position
731    ///
732    /// # Returns
733    /// The line number (1-based)
734    #[inline]
735    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
736        self.line_by_pos.get(pos).copied()
737    }
738
739    /// Check if the query is empty (no known tokens).
740    #[inline]
741    pub fn is_empty(&self) -> bool {
742        self.tokens.is_empty()
743    }
744
745    /// Get a query run covering the entire query.
746    ///
747    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
748    pub fn whole_query_run(&self) -> QueryRun<'a> {
749        QueryRun::whole_query_snapshot(self)
750    }
751
752    /// Subtract matched span positions from matchables.
753    ///
754    /// This removes the positions from both high and low matchables.
755    ///
756    /// # Arguments
757    /// * `span` - The span of positions to subtract
758    ///
759    /// Corresponds to Python: `subtract()` method (lines 328-334)
760    pub fn subtract(&mut self, span: &PositionSpan) {
761        self.high_matchables.remove_span(span);
762        self.low_matchables.remove_span(span);
763    }
764
765    /// Extract matched text for a given line range.
766    ///
767    /// Returns the text from the original input between start_line and end_line
768    /// (both inclusive, 1-indexed).
769    ///
770    /// # Arguments
771    /// * `start_line` - Starting line number (1-indexed)
772    /// * `end_line` - Ending line number (1-indexed)
773    ///
774    /// # Returns
775    /// The matched text, or empty string if lines are out of range
776    ///
777    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
778    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
779        matched_text_from_text(&self.text, start_line, end_line)
780    }
781}
782
783#[derive(Debug, Clone)]
784struct WholeQueryRunSnapshot<'a> {
785    index: &'a LicenseIndex,
786    tokens: Vec<TokenId>,
787    line_by_pos: Vec<usize>,
788    high_matchables: PositionSet,
789    low_matchables: PositionSet,
790}
791
792/// A query run is a slice of query tokens identified by a start and end positions.
793///
794/// Query runs break a query into manageable chunks for efficient matching.
795/// They track matchable token positions and support subtraction of matched spans.
796///
797/// Based on Python QueryRun class at:
798/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
799#[derive(Debug, Clone)]
800pub struct QueryRun<'a> {
801    query: Option<&'a Query<'a>>,
802    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
803    pub start: usize,
804    pub end: Option<usize>,
805    cached_high_matchables: OnceCell<PositionSet>,
806    cached_low_matchables: OnceCell<PositionSet>,
807    combined_matchables: RefCell<Option<PositionSet>>,
808}
809
810impl<'a> QueryRun<'a> {
811    /// Create a new query run from a query with start and end positions.
812    ///
813    /// # Arguments
814    /// * `query` - The parent query
815    /// * `start` - The start position (inclusive)
816    /// * `end` - The end position (inclusive), or None for an empty run
817    ///
818    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
819    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
820        Self {
821            query: Some(query),
822            whole_query_snapshot: None,
823            start,
824            end,
825            cached_high_matchables: OnceCell::new(),
826            cached_low_matchables: OnceCell::new(),
827            combined_matchables: RefCell::new(None),
828        }
829    }
830
831    fn whole_query_snapshot(query: &Query<'a>) -> Self {
832        let end = if query.is_empty() {
833            None
834        } else {
835            Some(query.tokens.len() - 1)
836        };
837
838        Self {
839            query: None,
840            whole_query_snapshot: Some(WholeQueryRunSnapshot {
841                index: query.index,
842                tokens: query.tokens.clone(),
843                line_by_pos: query.line_by_pos.clone(),
844                high_matchables: query.high_matchables.clone(),
845                low_matchables: query.low_matchables.clone(),
846            }),
847            start: 0,
848            end,
849            cached_high_matchables: OnceCell::new(),
850            cached_low_matchables: OnceCell::new(),
851            combined_matchables: RefCell::new(None),
852        }
853    }
854
855    fn source_tokens(&self) -> &[TokenId] {
856        if let Some(query) = self.query {
857            &query.tokens
858        } else {
859            &self
860                .whole_query_snapshot
861                .as_ref()
862                .expect("snapshot-backed whole query run should have snapshot data")
863                .tokens
864        }
865    }
866
867    fn source_line_by_pos(&self) -> &[usize] {
868        if let Some(query) = self.query {
869            &query.line_by_pos
870        } else {
871            &self
872                .whole_query_snapshot
873                .as_ref()
874                .expect("snapshot-backed whole query run should have snapshot data")
875                .line_by_pos
876        }
877    }
878
879    fn source_high_matchables(&self) -> &PositionSet {
880        if let Some(query) = self.query {
881            &query.high_matchables
882        } else {
883            &self
884                .whole_query_snapshot
885                .as_ref()
886                .expect("snapshot-backed whole query run should have snapshot data")
887                .high_matchables
888        }
889    }
890
891    fn source_low_matchables(&self) -> &PositionSet {
892        if let Some(query) = self.query {
893            &query.low_matchables
894        } else {
895            &self
896                .whole_query_snapshot
897                .as_ref()
898                .expect("snapshot-backed whole query run should have snapshot data")
899                .low_matchables
900        }
901    }
902
903    /// Get the license index used by this query run.
904    pub fn get_index(&self) -> &LicenseIndex {
905        if let Some(query) = self.query {
906            query.index
907        } else {
908            self.whole_query_snapshot
909                .as_ref()
910                .expect("snapshot-backed whole query run should have snapshot data")
911                .index
912        }
913    }
914
915    /// Get the line number for a specific token position.
916    ///
917    /// # Arguments
918    /// * `pos` - Absolute token position in the query
919    ///
920    /// # Returns
921    /// The line number (1-based), or None if position is out of range
922    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
923        self.source_line_by_pos().get(pos).copied()
924    }
925
926    /// Get the sequence of token IDs for this run.
927    ///
928    /// Returns empty slice if end is None.
929    ///
930    /// Corresponds to Python: `tokens` property (lines 779-786)
931    pub fn tokens(&self) -> &[TokenId] {
932        match self.end {
933            Some(end) => &self.source_tokens()[self.start..=end],
934            None => &[],
935        }
936    }
937
938    /// Iterate over token IDs with their absolute positions.
939    ///
940    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
941    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
942        self.tokens()
943            .iter()
944            .copied()
945            .enumerate()
946            .map(|(i, tid)| (self.start + i, tid))
947    }
948
949    /// Check if this query run contains only digit tokens.
950    ///
951    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
952    pub fn is_digits_only(&self) -> bool {
953        self.tokens()
954            .iter()
955            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
956    }
957
958    /// Check if this query run has matchable tokens.
959    ///
960    /// # Arguments
961    /// * `include_low` - If true, include low-value tokens in the check
962    /// * `exclude_positions` - Optional set of spans containing positions to exclude
963    ///
964    /// Returns true if there are matchable tokens remaining
965    ///
966    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
967    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
968        if self.is_digits_only() {
969            return false;
970        }
971
972        let matchables = self.matchables(include_low);
973
974        if exclude_positions.is_empty() {
975            return !matchables.is_empty();
976        }
977
978        let mut matchable_set = matchables;
979        for span in exclude_positions {
980            matchable_set.remove_span(span);
981        }
982
983        !matchable_set.is_empty()
984    }
985
986    pub fn matchables(&self, include_low: bool) -> PositionSet {
987        if include_low {
988            if let Some(ref cached) = *self.combined_matchables.borrow() {
989                return cached.clone();
990            }
991            let combined = self.low_matchables().union(&self.high_matchables());
992            *self.combined_matchables.borrow_mut() = Some(combined.clone());
993            combined
994        } else {
995            self.high_matchables()
996        }
997    }
998
999    pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1000        let high_matchables = self.high_matchables();
1001        if high_matchables.is_empty() {
1002            return Vec::new();
1003        }
1004
1005        let matchables = self.matchables(true);
1006        self.tokens_with_pos()
1007            .map(|(pos, tid)| {
1008                if matchables.contains(pos) {
1009                    Some(tid)
1010                } else {
1011                    None
1012                }
1013            })
1014            .collect()
1015    }
1016
1017    pub fn high_matchables(&self) -> PositionSet {
1018        self.cached_high_matchables
1019            .get_or_init(|| {
1020                let start = self.start;
1021                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1022                let source = self.source_high_matchables();
1023                let live_span = PositionSpan::new(start, end);
1024                source
1025                    .iter()
1026                    .filter(|&pos| live_span.contains(pos))
1027                    .collect()
1028            })
1029            .clone()
1030    }
1031
1032    pub fn low_matchables(&self) -> PositionSet {
1033        self.cached_low_matchables
1034            .get_or_init(|| {
1035                let start = self.start;
1036                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1037                let source = self.source_low_matchables();
1038                let live_span = PositionSpan::new(start, end);
1039                source
1040                    .iter()
1041                    .filter(|&pos| live_span.contains(pos))
1042                    .collect()
1043            })
1044            .clone()
1045    }
1046}
1047
1048#[cfg(test)]
1049mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs