provenant/license_detection/query/
mod.rs

1//! Query processing - tokenized input for license matching.
2
3use crate::license_detection::index::LicenseIndex;
4use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
5use crate::license_detection::spdx_lid::split_spdx_lid;
6use crate::license_detection::tokenize::STOPWORDS;
7use crate::license_detection::tokenize::tokenize_as_ids;
8use bit_set::BitSet;
9use once_cell::sync::Lazy;
10use regex::Regex;
11use std::cell::{OnceCell, RefCell};
12use std::collections::{HashMap, HashSet};
13
// Matches one query token: a run of word characters other than `_`, optionally
// followed by a single `+` and a further run of such characters.
static QUERY_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
// Splits text into two kinds of named captures:
// - `token`: the same shape as `QUERY_PATTERN`;
// - `punct`: a run of `_`, `+`, whitespace or non-word characters, optionally
//   followed by one more `_`, whitespace or non-word character.
static MATCHED_TEXT_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
20
/// One piece of the original text (a word or a punctuation run) produced while
/// re-tokenizing the text for matched-text diagnostics.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece as it will be rendered.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Known-token position of this piece, or `None` when the piece is
    /// punctuation or is not present in the index dictionary.
    pos: Option<usize>,
    /// `true` for `token` captures, `false` for `punct` captures.
    is_text: bool,
    /// Set to `true` once the piece's position is found among the matched positions.
    is_matched: bool,
}
29
/// A single contiguous range of token positions, inclusive at both ends
/// (`start..=end`).
///
/// It is used to track matched token positions and to do position arithmetic.
///
/// Not to be confused with `spans::Span`, which tracks multiple byte ranges
/// for coverage.
///
/// Based on Python Span class at:
/// reference/scancode-toolkit/src/licensedcode/spans.py
#[derive(Debug, Clone)]
pub struct PositionSpan {
    start: usize,
    end: usize,
}

impl PositionSpan {
    /// Builds a span covering `start` through `end`, both inclusive.
    pub fn new(start: usize, end: usize) -> Self {
        PositionSpan { start, end }
    }

    /// Reports whether `pos` lies within the inclusive bounds of the span.
    pub fn contains(&self, pos: usize) -> bool {
        (self.start..=self.end).contains(&pos)
    }

    /// Yields every position of the span in ascending order.
    ///
    /// Nothing is yielded when `start` is greater than `end`.
    pub fn iter(&self) -> impl Iterator<Item = usize> + '_ {
        let (first, last) = (self.start, self.end);
        first..=last
    }
}
58
/// Tokenized view of an input text, ready for license matching.
///
/// Query holds:
/// - Known token IDs (tokens existing in the index dictionary)
/// - Token positions and their corresponding line numbers (line_by_pos)
/// - Unknown tokens (tokens not in dictionary) tracked per position
/// - Stopwords tracked per position
/// - Positions with short/digit-only tokens
/// - High and low matchable token positions (for tracking what's been matched)
///
/// Based on Python Query class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
#[derive(Debug)]
pub struct Query<'a> {
    /// The original input text.
    ///
    /// Corresponds to Python: `self.query_string` (line 215)
    pub text: String,

    /// Token IDs for known tokens (tokens found in the index dictionary)
    ///
    /// Corresponds to Python: `self.tokens = []` (line 228)
    pub tokens: Vec<TokenId>,

    /// Mapping from token position to line number (1-based)
    ///
    /// Each token position in `self.tokens` maps to the line number where it appears.
    /// This is used for match position reporting.
    ///
    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
    pub line_by_pos: Vec<usize>,

    /// Mapping from token position to count of unknown tokens after that position
    ///
    /// Unknown tokens are those not found in the dictionary. We track them by
    /// counting how many unknown tokens appear after each known position.
    /// Unknown tokens before the first known token are tracked at position -1
    /// in Python; in Rust they are stored under the key `None`.
    ///
    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
    pub unknowns_by_pos: HashMap<Option<i32>, usize>,

    /// Mapping from token position to count of stopwords after that position
    ///
    /// Same keying scheme as `unknowns_by_pos`, but counting stopwords.
    ///
    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
    pub stopwords_by_pos: HashMap<Option<i32>, usize>,

    /// Set of positions with single-character or digit-only tokens
    ///
    /// These tokens have special handling in matching.
    ///
    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
    pub shorts_and_digits_pos: HashSet<usize>,

    /// High-value matchable token positions (legalese tokens)
    ///
    /// These are tokens with ID < len_legalese.
    ///
    /// Corresponds to Python: `self.high_matchables` (line 293)
    pub high_matchables: BitSet,

    /// Low-value matchable token positions (non-legalese tokens)
    ///
    /// These are tokens with ID >= len_legalese.
    ///
    /// Corresponds to Python: `self.low_matchables` (line 294)
    pub low_matchables: BitSet,

    /// True if the query is detected as binary content
    ///
    /// Corresponds to Python: `self.is_binary = False` (line 225)
    pub is_binary: bool,

    /// Raw query run ranges (start, end) computed during tokenization.
    ///
    /// `end` is the position of the last token of the run (inclusive).
    /// QueryRuns are created on-demand from these ranges.
    ///
    /// Corresponds to Python: `self.query_runs = []` (line 274)
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// SPDX-License-Identifier lines found during tokenization.
    ///
    /// Each tuple is (spdx_text, start_token_pos, end_token_pos), where
    /// `end_token_pos` is one past the last known token position of the line.
    /// Used for creating LicenseMatches with correct token positions.
    ///
    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Reference to the license index for dictionary access and metadata
    pub index: &'a LicenseIndex,
}
151
/// Returns lines `start_line..=end_line` of `text` (1-based, inclusive),
/// joined with `\n`.
///
/// An empty string is returned when either bound is `0`, when `start_line`
/// is greater than `end_line`, or when no line falls inside the range.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line != 0 && end_line != 0 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // Keep the first `end_line` lines, then drop those before `start_line`.
    let selected: Vec<&str> = text
        .lines()
        .take(end_line)
        .skip(start_line - 1)
        .collect();

    selected.join("\n")
}
170
171pub fn matched_text_diagnostics_from_text(
172    text: &str,
173    query: &Query<'_>,
174    matched_positions: &HashSet<usize>,
175    start_pos: usize,
176    end_pos: usize,
177    start_line: usize,
178    end_line: usize,
179) -> String {
180    let tokens = tokenize_matched_text(text, query);
181    let reportable_tokens = collect_reportable_tokens(
182        tokens,
183        matched_positions,
184        start_pos,
185        end_pos,
186        start_line,
187        end_line,
188    );
189    let line_endings = collect_line_endings(text);
190
191    render_diagnostic_tokens(&reportable_tokens, &line_endings)
192}
193
194fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
195    let mut tokens = Vec::new();
196    let mut pos = 0usize;
197    let mut line_num = 1usize;
198
199    for line in text.split_inclusive('\n') {
200        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
201            if let Some(token_match) = capture.name("token") {
202                let token_text = token_match.as_str();
203                let retokenized: Vec<String> = QUERY_PATTERN
204                    .find_iter(&token_text.to_lowercase())
205                    .map(|m| m.as_str().to_string())
206                    .filter(|token| !STOPWORDS.contains(token.as_str()))
207                    .collect();
208
209                if retokenized.is_empty() {
210                    tokens.push(MatchedTextToken {
211                        value: token_text.to_string(),
212                        line_num,
213                        pos: None,
214                        is_text: true,
215                        is_matched: false,
216                    });
217                } else if retokenized.len() == 1 {
218                    let token = &retokenized[0];
219                    let token_pos = if query.index.dictionary.get(token).is_some() {
220                        let current_pos = pos;
221                        pos += 1;
222                        Some(current_pos)
223                    } else {
224                        None
225                    };
226
227                    tokens.push(MatchedTextToken {
228                        value: token_text.to_string(),
229                        line_num,
230                        pos: token_pos,
231                        is_text: true,
232                        is_matched: false,
233                    });
234                } else {
235                    for token in retokenized {
236                        let token_pos = if query.index.dictionary.get(&token).is_some() {
237                            let current_pos = pos;
238                            pos += 1;
239                            Some(current_pos)
240                        } else {
241                            None
242                        };
243
244                        tokens.push(MatchedTextToken {
245                            value: token,
246                            line_num,
247                            pos: token_pos,
248                            is_text: true,
249                            is_matched: false,
250                        });
251                    }
252                }
253            } else if let Some(punct_match) = capture.name("punct") {
254                tokens.push(MatchedTextToken {
255                    value: punct_match.as_str().to_string(),
256                    line_num,
257                    pos: None,
258                    is_text: false,
259                    is_matched: false,
260                });
261            }
262        }
263
264        line_num += 1;
265    }
266
267    tokens
268}
269
/// Selects the tokens that belong to a match so they can be rendered.
///
/// Tokens outside `start_line..=end_line` are ignored. Within that range a
/// token is kept when:
/// - its position is in `matched_positions` (it is then flagged as matched), or
/// - it lies between the token at `start_pos` and the token at `end_pos`
///   (both included), or
/// - it is the non-blank punctuation piece directly following the token at
///   `end_pos`.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &HashSet<usize>,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started` is true between the start token and the end token.
    let mut started = false;
    // `finished` becomes true once the end token has been seen.
    let mut finished = false;
    // Index (in `tokens`) of the end token, until the token after it is handled.
    let mut end_real_pos = None;
    // Index (in `tokens`) of the previously processed in-range token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Tokens come in line order, so nothing after this can be in range.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token
            .pos
            .is_some_and(|pos| matched_positions.contains(&pos))
        {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        if started && !finished {
            is_included = true;
        }

        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // Only true for the token immediately after the end token: keep it when
        // it is non-blank punctuation, then stop looking.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
334
/// Returns the terminator of every line of `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line
/// without a terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
348
349fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
350    let mut rendered = String::new();
351    let mut previous_line: Option<usize> = None;
352
353    for token in tokens {
354        if let Some(prev_line) = previous_line
355            && token.line_num > prev_line
356        {
357            for line in prev_line..token.line_num {
358                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
359                    rendered.push_str(line_ending.as_str());
360                }
361            }
362        }
363
364        let token_value = if token.is_text {
365            token.value.as_str()
366        } else {
367            token
368                .value
369                .strip_suffix("\r\n")
370                .or_else(|| token.value.strip_suffix('\n'))
371                .unwrap_or(token.value.as_str())
372        };
373
374        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
375            if token.is_matched {
376                rendered.push_str(token_value);
377            } else {
378                rendered.push('[');
379                rendered.push_str(token_value);
380                rendered.push(']');
381            }
382        } else {
383            rendered.push_str(token_value);
384        }
385
386        previous_line = Some(token.line_num);
387    }
388
389    rendered
390}
391
392impl<'a> Query<'a> {
    /// Line threshold used for text that is not derived from a binary.
    ///
    /// Detection scans file-like text, so this uses Python's
    /// `build_query(..., text_line_threshold=15)` threshold.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold used for text that was extracted from binary content.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when long lines are broken up.
    const MAX_TOKEN_PER_LINE: usize = 25;
410
411    fn compute_spdx_offset(
412        tokens: &[QueryToken],
413        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
414    ) -> Option<usize> {
415        let get_known_id = |i: usize| -> Option<TokenId> {
416            match tokens.get(i)? {
417                QueryToken::Known(known) => Some(known.id),
418                _ => None,
419            }
420        };
421
422        let spdx_id = dictionary.get("spdx")?;
423        let license_id = dictionary.get("license")?;
424        let identifier_id = dictionary.get("identifier")?;
425        let licence_id = dictionary.get("licence");
426
427        let licenses_id = dictionary.get("licenses");
428        let nuget_id = dictionary.get("nuget");
429        let org_id = dictionary.get("org");
430
431        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
432            ids.iter().all(|id| id.is_some())
433                && ids[0] == Some(spdx_id)
434                && (ids[1] == Some(license_id) || ids[1] == licence_id)
435                && ids[2] == Some(identifier_id)
436        };
437
438        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
439            licenses_id.is_some()
440                && nuget_id.is_some()
441                && org_id.is_some()
442                && ids[0] == licenses_id
443                && ids[1] == Some(nuget_id.unwrap())
444                && ids[2] == Some(org_id.unwrap())
445        };
446
447        if tokens.len() >= 3 {
448            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
449            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
450                return Some(0);
451            }
452        }
453
454        if tokens.len() >= 4 {
455            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
456            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
457                return Some(1);
458            }
459        }
460
461        if tokens.len() >= 5 {
462            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
463            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
464                return Some(2);
465            }
466        }
467
468        None
469    }
470
471    pub fn from_extracted_text(
472        text: &str,
473        index: &'a LicenseIndex,
474        binary_derived: bool,
475    ) -> Result<Self, anyhow::Error> {
476        let line_threshold = if binary_derived {
477            Self::BINARY_LINE_THRESHOLD
478        } else {
479            Self::TEXT_LINE_THRESHOLD
480        };
481
482        Self::with_source_options(text, index, line_threshold, Some(binary_derived))
483    }
484
485    /// Iterate over query runs.
486    ///
487    /// Corresponds to Python: `query.query_runs` property iteration
488    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
489        self.query_run_ranges
490            .iter()
491            .map(|&(start, end)| QueryRun::new(self, start, end))
492            .collect()
493    }
494
    /// Tokenizes `text` line by line and builds the complete `Query`.
    ///
    /// # Arguments
    /// * `text` - The input text
    /// * `index` - The license index providing the token dictionary
    /// * `line_threshold` - Number of "empty" lines after which a query run is closed
    /// * `binary_derived` - When `Some`, used as `is_binary` as is; when `None`,
    ///   binary content is detected from `text`
    ///
    /// # Errors
    /// Returns an error when binary detection fails.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
    ) -> Result<Self, anyhow::Error> {
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<i32>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = HashSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the last known token seen so far; -1 until the first one.
        let mut known_pos = -1i32;
        // Becomes true once the first known token has been seen.
        let mut started = false;
        let mut current_line = 1usize;

        // Per line: `Some(token)` for known tokens, `None` for unknown ones.
        // Stopwords are not recorded here.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for line in text.lines() {
            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos += 1;
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = Some(known_pos);
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos as usize);
                        }
                    }
                    // Unknown tokens before any known token are counted under `None`.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    // Stopwords before any known token are counted under `None`.
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(Some(known_pos)).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // NOTE(review): `offset` indexes `line_query_tokens` (all token kinds)
            // but is added to a known-token position below; the two only coincide
            // when every token before the prefix is a known token - confirm.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset as i32;

                if spdx_start_known_pos <= line_last_known_pos {
                    let spdx_start = spdx_start_known_pos as usize;
                    // Exclusive end: one past the last known token of the line.
                    let spdx_end = (line_last_known_pos + 1) as usize;
                    spdx_lines.push((spdx_text, spdx_start, spdx_end));
                }
            }

            tokens_by_line.push(line_tokens);
            current_line += 1;
        }

        // Positions of legalese tokens.
        let high_matchables: BitSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        // Positions of regular tokens.
        let low_matchables: BitSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
616
617    /// Detect if text is binary content.
618    ///
619    /// Binary detection checks for:
620    /// - Null bytes (0x00)
621    /// - High ratio of non-printable characters
622    ///
623    /// # Arguments
624    /// * `text` - The text to analyze
625    ///
626    /// # Returns
627    /// true if binary, false otherwise
628    ///
629    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
630    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
631        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
632
633        if null_byte_count > 0 {
634            return Ok(true);
635        }
636
637        let non_printable_ratio = text
638            .chars()
639            .filter(|&c| {
640                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
641            })
642            .count() as f64
643            / text.len().max(1) as f64;
644
645        Ok(non_printable_ratio > 0.3)
646    }
647
648    /// Detect if text has very long lines (for minified JS/CSS).
649    ///
650    /// # Arguments
651    /// * `text` - The text to analyze
652    ///
653    /// # Returns
654    /// true if there are lines with many tokens, false otherwise
655    ///
656    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
657    fn detect_long_lines(text: &str) -> bool {
658        text.lines()
659            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
660    }
661
662    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
663        lines
664            .iter()
665            .flat_map(|line| {
666                if line.is_empty() {
667                    return Vec::new();
668                }
669
670                if line.len() <= Self::MAX_TOKEN_PER_LINE {
671                    vec![line.clone()]
672                } else {
673                    line.chunks(Self::MAX_TOKEN_PER_LINE)
674                        .map(|chunk| chunk.to_vec())
675                        .collect()
676                }
677            })
678            .collect()
679    }
680
    /// Groups the known tokens of the text into query runs.
    ///
    /// A run is closed once at least `line_threshold` "empty" lines have been
    /// counted since it last saw a line containing a legalese token. Lines
    /// without tokens, lines without known tokens, digit-only lines and lines
    /// without legalese tokens all count as "empty". Runs made only of
    /// digit-only tokens are discarded.
    ///
    /// # Arguments
    /// * `tokens_by_line` - Per line, `Some(token)` for known tokens and `None`
    ///   for unknown ones
    /// * `line_threshold` - Number of "empty" lines that closes a run
    /// * `has_long_lines` - When true, lines are first broken into chunks
    ///
    /// # Returns
    /// `(start, end)` known-token positions of each run, `end` being inclusive.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run has not seen any known token yet.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Position of the next known token; unknown tokens do not advance it.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Close the current run before handling this line when enough
            // "empty" lines have accumulated.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // A run without tokens yet starts at the next known-token position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) do not prevent a line from being all-digit.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with a legalese token resets the "empty" line count.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the last open run.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
757
    /// Get the line number for a token position.
    ///
    /// # Arguments
    /// * `pos` - The token position
    ///
    /// # Returns
    /// The line number (1-based), or `None` when `pos` is not a valid token
    /// position
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
771
    /// Check if the query is empty (no known tokens).
    ///
    /// Unknown tokens and stopwords are not stored in `tokens`, so a text made
    /// only of those is reported as empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
777
    /// Get a query run covering the entire query.
    ///
    /// The returned run holds its own snapshot of the query data (copied at
    /// call time) instead of borrowing this query.
    ///
    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
784
785    /// Subtract matched span positions from matchables.
786    ///
787    /// This removes the positions from both high and low matchables.
788    ///
789    /// # Arguments
790    /// * `span` - The span of positions to subtract
791    ///
792    /// Corresponds to Python: `subtract()` method (lines 328-334)
793    pub fn subtract(&mut self, span: &PositionSpan) {
794        for pos in span.iter() {
795            self.high_matchables.remove(pos);
796            self.low_matchables.remove(pos);
797        }
798    }
799
    /// Extract matched text for a given line range.
    ///
    /// Returns the text from the original input between start_line and end_line
    /// (both inclusive, 1-indexed). Lines are joined with `\n`.
    ///
    /// # Arguments
    /// * `start_line` - Starting line number (1-indexed)
    /// * `end_line` - Ending line number (1-indexed)
    ///
    /// # Returns
    /// The matched text, or empty string if lines are out of range
    ///
    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
816}
817
/// Owned copy of the query data needed by a query run that spans the whole
/// query, so that the run does not have to borrow the `Query` itself.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The license index of the query the snapshot was taken from.
    index: &'a LicenseIndex,
    /// Copy of the query's known token IDs.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line mapping.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchables at snapshot time.
    high_matchables: BitSet,
    /// Copy of the query's low matchables at snapshot time.
    low_matchables: BitSet,
}
826
/// A query run is a slice of query tokens identified by a start and end positions.
///
/// Query runs break a query into manageable chunks for efficient matching.
/// They track matchable token positions and support subtraction of matched spans.
///
/// A run reads its data either from a borrowed `Query` or from an owned
/// snapshot of a whole query; exactly one of the two is expected to be set.
///
/// Based on Python QueryRun class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// The parent query, for runs created with `QueryRun::new`.
    query: Option<&'a Query<'a>>,
    /// Owned query data, for runs created as a whole-query snapshot.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// The start position of the run (inclusive).
    pub start: usize,
    /// The end position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    // The three fields below start out empty; presumably they cache
    // matchables computed on demand - confirm against the accessor methods.
    cached_high_matchables: OnceCell<BitSet>,
    cached_low_matchables: OnceCell<BitSet>,
    combined_matchables: RefCell<Option<BitSet>>,
}
844
845impl<'a> QueryRun<'a> {
    /// Create a new query run from a query with start and end positions.
    ///
    /// The run borrows `query`; no snapshot is taken and all caches start empty.
    ///
    /// # Arguments
    /// * `query` - The parent query
    /// * `start` - The start position (inclusive)
    /// * `end` - The end position (inclusive), or None for an empty run
    ///
    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
865
866    fn whole_query_snapshot(query: &Query<'a>) -> Self {
867        let end = if query.is_empty() {
868            None
869        } else {
870            Some(query.tokens.len() - 1)
871        };
872
873        Self {
874            query: None,
875            whole_query_snapshot: Some(WholeQueryRunSnapshot {
876                index: query.index,
877                tokens: query.tokens.clone(),
878                line_by_pos: query.line_by_pos.clone(),
879                high_matchables: query.high_matchables.clone(),
880                low_matchables: query.low_matchables.clone(),
881            }),
882            start: 0,
883            end,
884            cached_high_matchables: OnceCell::new(),
885            cached_low_matchables: OnceCell::new(),
886            combined_matchables: RefCell::new(None),
887        }
888    }
889
890    fn source_tokens(&self) -> &[TokenId] {
891        if let Some(query) = self.query {
892            &query.tokens
893        } else {
894            &self
895                .whole_query_snapshot
896                .as_ref()
897                .expect("snapshot-backed whole query run should have snapshot data")
898                .tokens
899        }
900    }
901
902    fn source_line_by_pos(&self) -> &[usize] {
903        if let Some(query) = self.query {
904            &query.line_by_pos
905        } else {
906            &self
907                .whole_query_snapshot
908                .as_ref()
909                .expect("snapshot-backed whole query run should have snapshot data")
910                .line_by_pos
911        }
912    }
913
914    fn source_high_matchables(&self) -> &BitSet {
915        if let Some(query) = self.query {
916            &query.high_matchables
917        } else {
918            &self
919                .whole_query_snapshot
920                .as_ref()
921                .expect("snapshot-backed whole query run should have snapshot data")
922                .high_matchables
923        }
924    }
925
926    fn source_low_matchables(&self) -> &BitSet {
927        if let Some(query) = self.query {
928            &query.low_matchables
929        } else {
930            &self
931                .whole_query_snapshot
932                .as_ref()
933                .expect("snapshot-backed whole query run should have snapshot data")
934                .low_matchables
935        }
936    }
937
938    /// Get the license index used by this query run.
939    pub fn get_index(&self) -> &LicenseIndex {
940        if let Some(query) = self.query {
941            query.index
942        } else {
943            self.whole_query_snapshot
944                .as_ref()
945                .expect("snapshot-backed whole query run should have snapshot data")
946                .index
947        }
948    }
949
950    /// Get the line number for a specific token position.
951    ///
952    /// # Arguments
953    /// * `pos` - Absolute token position in the query
954    ///
955    /// # Returns
956    /// The line number (1-based), or None if position is out of range
957    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
958        self.source_line_by_pos().get(pos).copied()
959    }
960
961    /// Get the sequence of token IDs for this run.
962    ///
963    /// Returns empty slice if end is None.
964    ///
965    /// Corresponds to Python: `tokens` property (lines 779-786)
966    pub fn tokens(&self) -> &[TokenId] {
967        match self.end {
968            Some(end) => &self.source_tokens()[self.start..=end],
969            None => &[],
970        }
971    }
972
973    /// Iterate over token IDs with their absolute positions.
974    ///
975    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
976    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
977        self.tokens()
978            .iter()
979            .copied()
980            .enumerate()
981            .map(|(i, tid)| (self.start + i, tid))
982    }
983
984    /// Check if this query run contains only digit tokens.
985    ///
986    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
987    pub fn is_digits_only(&self) -> bool {
988        self.tokens()
989            .iter()
990            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
991    }
992
993    /// Check if this query run has matchable tokens.
994    ///
995    /// # Arguments
996    /// * `include_low` - If true, include low-value tokens in the check
997    /// * `exclude_positions` - Optional set of spans containing positions to exclude
998    ///
999    /// Returns true if there are matchable tokens remaining
1000    ///
1001    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
1002    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
1003        if self.is_digits_only() {
1004            return false;
1005        }
1006
1007        let matchables = self.matchables(include_low);
1008
1009        if exclude_positions.is_empty() {
1010            return !matchables.is_empty();
1011        }
1012
1013        let mut matchable_set = matchables;
1014        for span in exclude_positions {
1015            for pos in span.iter() {
1016                matchable_set.remove(pos);
1017            }
1018        }
1019
1020        !matchable_set.is_empty()
1021    }
1022
1023    pub fn matchables(&self, include_low: bool) -> BitSet {
1024        if include_low {
1025            if let Some(ref cached) = *self.combined_matchables.borrow() {
1026                return cached.clone();
1027            }
1028            let combined: BitSet = self
1029                .low_matchables()
1030                .union(&self.high_matchables())
1031                .collect();
1032            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1033            combined
1034        } else {
1035            self.high_matchables()
1036        }
1037    }
1038
1039    pub fn matchable_tokens(&self) -> Vec<i32> {
1040        let high_matchables = self.high_matchables();
1041        if high_matchables.is_empty() {
1042            return Vec::new();
1043        }
1044
1045        let matchables = self.matchables(true);
1046        self.tokens_with_pos()
1047            .map(|(pos, tid)| {
1048                if matchables.contains(pos) {
1049                    tid.raw() as i32
1050                } else {
1051                    -1
1052                }
1053            })
1054            .collect()
1055    }
1056
1057    pub fn high_matchables(&self) -> BitSet {
1058        self.cached_high_matchables
1059            .get_or_init(|| {
1060                let start = self.start;
1061                let end = self.end;
1062                let source = self.source_high_matchables();
1063                let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1064                source
1065                    .iter()
1066                    .filter(|&pos| live_span.contains(pos))
1067                    .collect()
1068            })
1069            .clone()
1070    }
1071
1072    pub fn low_matchables(&self) -> BitSet {
1073        self.cached_low_matchables
1074            .get_or_init(|| {
1075                let start = self.start;
1076                let end = self.end;
1077                let source = self.source_low_matchables();
1078                let live_span = PositionSpan::new(start, end.unwrap_or(usize::MAX));
1079                source
1080                    .iter()
1081                    .filter(|&pos| live_span.contains(pos))
1082                    .collect()
1083            })
1084            .clone()
1085    }
1086}
1087
1088#[cfg(test)]
1089mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs