provenant/license_detection/query/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Query processing - tokenized input for license matching.
5
6use crate::license_detection::index::LicenseIndex;
7use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
8use crate::license_detection::models::PositionSpan;
9use crate::license_detection::position_set::PositionSet;
10use crate::license_detection::spdx_lid::split_spdx_lid;
11use crate::license_detection::tokenize::STOPWORDS;
12use crate::license_detection::tokenize::tokenize_as_ids;
13use regex::Regex;
14use std::cell::{OnceCell, RefCell};
15use std::collections::HashMap;
16use std::sync::LazyLock;
17use std::time::Instant;
18
19static QUERY_PATTERN: LazyLock<Regex> =
20    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
21static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
22    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
23        .expect("valid matched text regex")
24});
25
/// One piece of the original text as seen by the matched-text renderers.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece (a word, or a run of punctuation/whitespace).
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Known-token position of the piece, or `None` when it has none
    /// (punctuation, stopwords, words missing from the dictionary).
    pos: Option<usize>,
    /// `true` for word pieces, `false` for punctuation/whitespace pieces.
    is_text: bool,
    /// Set once the piece's position is found among the matched positions.
    is_matched: bool,
}
34
35///
36/// Query holds:
37/// - Known token IDs (tokens existing in the index dictionary)
38/// - Token positions and their corresponding line numbers (line_by_pos)
39/// - Unknown tokens (tokens not in dictionary) tracked per position
40/// - Stopwords tracked per position
41/// - Positions with short/digit-only tokens
42/// - High and low matchable token positions (for tracking what's been matched)
43///
44/// Based on Python Query class at:
45/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
46#[derive(Debug)]
47pub struct Query<'a> {
48    /// The original input text.
49    ///
50    /// Corresponds to Python: `self.query_string` (line 215)
51    pub text: String,
52
53    /// Token IDs for known tokens (tokens found in the index dictionary)
54    ///
55    /// Corresponds to Python: `self.tokens = []` (line 228)
56    pub tokens: Vec<TokenId>,
57
58    /// Mapping from token position to line number (1-based)
59    ///
60    /// Each token position in `self.tokens` maps to the line number where it appears.
61    /// This is used for match position reporting.
62    ///
63    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
64    pub line_by_pos: Vec<usize>,
65
66    /// Mapping from token position to count of unknown tokens after that position
67    ///
68    /// Unknown tokens are those not found in the dictionary. We track them by
69    /// counting how many unknown tokens appear after each known position.
70    /// Unknown tokens before the first known token are tracked with the key `None`.
71    ///
72    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
73    pub unknowns_by_pos: HashMap<Option<usize>, usize>,
74
75    /// Mapping from token position to count of stopwords after that position
76    ///
77    /// Similar to unknown_tokens, but for stopwords.
78    ///
79    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
80    pub stopwords_by_pos: HashMap<Option<usize>, usize>,
81
82    /// Set of positions with single-character or digit-only tokens
83    ///
84    /// These tokens have special handling in matching.
85    ///
86    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
87    pub shorts_and_digits_pos: PositionSet,
88
89    /// High-value matchable token positions (legalese tokens)
90    ///
91    /// These are tokens with ID < len_legalese.
92    ///
93    /// Corresponds to Python: `self.high_matchables` (line 293)
94    pub high_matchables: PositionSet,
95
96    /// Low-value matchable token positions (non-legalese tokens)
97    ///
98    /// These are tokens with ID >= len_legalese.
99    ///
100    /// Corresponds to Python: `self.low_matchables` (line 294)
101    pub low_matchables: PositionSet,
102
103    /// True if the query is detected as binary content
104    ///
105    /// Corresponds to Python: `self.is_binary = False` (line 225)
106    pub is_binary: bool,
107
108    /// Raw query run ranges (start, end) computed during tokenization.
109    ///
110    /// QueryRuns are created on-demand from these ranges.
111    ///
112    /// Corresponds to Python: `self.query_runs = []` (line 274)
113    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,
114
115    /// SPDX-License-Identifier lines found during tokenization.
116    ///
117    /// Each tuple is (spdx_text, start_token_pos, end_token_pos).
118    /// Used for creating LicenseMatches with correct token positions.
119    ///
120    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
121    pub spdx_lines: Vec<(String, usize, usize)>,
122
123    /// Reference to the license index for dictionary access and metadata
124    pub index: &'a LicenseIndex,
125}
126
/// Returns lines `start_line..=end_line` of `text` (1-based, inclusive), joined with `\n`.
///
/// Line terminators of the input (`\n` or `\r\n`) are not preserved. An empty string is
/// returned when either bound is zero, when `start_line > end_line`, or when the range
/// lies entirely past the end of the text.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= start_line;
    if !range_is_valid {
        return String::new();
    }

    // Safe arithmetic: start_line >= 1 and end_line >= start_line at this point.
    let wanted = end_line - start_line + 1;
    let selected: Vec<&str> = text.lines().skip(start_line - 1).take(wanted).collect();

    selected.join("\n")
}
145
146pub fn matched_text_diagnostics_from_text(
147    text: &str,
148    query: &Query<'_>,
149    matched_positions: &PositionSet,
150    start_pos: usize,
151    end_pos: usize,
152    start_line: usize,
153    end_line: usize,
154) -> String {
155    let tokens = tokenize_matched_text(text, query);
156    let reportable_tokens = collect_reportable_tokens(
157        tokens,
158        matched_positions,
159        start_pos,
160        end_pos,
161        start_line,
162        end_line,
163    );
164    let line_endings = collect_line_endings(text);
165
166    render_diagnostic_tokens(&reportable_tokens, &line_endings)
167}
168
169/// Extracts matched text using token-span mode instead of whole-line mode.
170///
171/// This is used for files with very long lines (e.g., minified JS) where
172/// whole-line extraction would return megabytes of text for a small match.
173/// Instead, it returns only the tokens within the matched span, producing
174/// output similar to `matched_text_diagnostics_from_text()` but without
175/// the diagnostic `[bracket]` wrapping.
176///
177/// Falls back to `matched_text_from_text()` if token positions are unavailable.
178pub fn matched_text_from_tokens(
179    text: &str,
180    query: &Query<'_>,
181    matched_positions: &PositionSet,
182    start_pos: usize,
183    end_pos: usize,
184    start_line: usize,
185    end_line: usize,
186) -> String {
187    let tokens = tokenize_matched_text(text, query);
188    let reportable_tokens = collect_reportable_tokens(
189        tokens,
190        matched_positions,
191        start_pos,
192        end_pos,
193        start_line,
194        end_line,
195    );
196    let line_endings = collect_line_endings(text);
197
198    render_plain_tokens(&reportable_tokens, &line_endings)
199}
200
201fn render_plain_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
202    let mut rendered = String::new();
203    let mut previous_line: Option<usize> = None;
204
205    for token in tokens {
206        if let Some(prev_line) = previous_line
207            && token.line_num > prev_line
208        {
209            for line in prev_line..token.line_num {
210                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
211                    rendered.push_str(line_ending.as_str());
212                }
213            }
214        }
215
216        let token_value = if token.is_text {
217            token.value.as_str()
218        } else {
219            token
220                .value
221                .strip_suffix("\r\n")
222                .or_else(|| token.value.strip_suffix('\n'))
223                .unwrap_or(token.value.as_str())
224        };
225
226        rendered.push_str(token_value);
227
228        previous_line = Some(token.line_num);
229    }
230
231    rendered
232}
233
234fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
235    let mut tokens = Vec::new();
236    let mut pos = 0usize;
237    for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
238        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
239            if let Some(token_match) = capture.name("token") {
240                let token_text = token_match.as_str();
241                let retokenized: Vec<String> = QUERY_PATTERN
242                    .find_iter(&token_text.to_lowercase())
243                    .map(|m| m.as_str().to_string())
244                    .filter(|token| !STOPWORDS.contains(token.as_str()))
245                    .collect();
246
247                if retokenized.is_empty() {
248                    tokens.push(MatchedTextToken {
249                        value: token_text.to_string(),
250                        line_num,
251                        pos: None,
252                        is_text: true,
253                        is_matched: false,
254                    });
255                } else if retokenized.len() == 1 {
256                    let token = &retokenized[0];
257                    let token_pos = if query.index.dictionary.get(token).is_some() {
258                        let current_pos = pos;
259                        pos += 1;
260                        Some(current_pos)
261                    } else {
262                        None
263                    };
264
265                    tokens.push(MatchedTextToken {
266                        value: token_text.to_string(),
267                        line_num,
268                        pos: token_pos,
269                        is_text: true,
270                        is_matched: false,
271                    });
272                } else {
273                    for token in retokenized {
274                        let token_pos = if query.index.dictionary.get(&token).is_some() {
275                            let current_pos = pos;
276                            pos += 1;
277                            Some(current_pos)
278                        } else {
279                            None
280                        };
281
282                        tokens.push(MatchedTextToken {
283                            value: token,
284                            line_num,
285                            pos: token_pos,
286                            is_text: true,
287                            is_matched: false,
288                        });
289                    }
290                }
291            } else if let Some(punct_match) = capture.name("punct") {
292                tokens.push(MatchedTextToken {
293                    value: punct_match.as_str().to_string(),
294                    line_num,
295                    pos: None,
296                    is_text: false,
297                    is_matched: false,
298                });
299            }
300        }
301    }
302
303    tokens
304}
305
306fn collect_reportable_tokens(
307    tokens: Vec<MatchedTextToken>,
308    matched_positions: &PositionSet,
309    start_pos: usize,
310    end_pos: usize,
311    start_line: usize,
312    end_line: usize,
313) -> Vec<MatchedTextToken> {
314    let mut reportable = Vec::new();
315    let mut started = false;
316    let mut finished = false;
317    let mut end_real_pos = None;
318    let mut last_real_pos = None;
319
320    for (real_pos, mut token) in tokens.into_iter().enumerate() {
321        if token.line_num < start_line {
322            continue;
323        }
324
325        if token.line_num > end_line {
326            break;
327        }
328
329        let mut is_included = false;
330
331        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
332            token.is_matched = true;
333            is_included = true;
334        }
335
336        if !started && token.pos == Some(start_pos) {
337            started = true;
338            is_included = true;
339        }
340
341        if started && !finished {
342            is_included = true;
343        }
344
345        if token.pos == Some(end_pos) {
346            finished = true;
347            started = false;
348            end_real_pos = Some(real_pos);
349        }
350
351        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
352            end_real_pos = None;
353            if !token.is_text && !token.value.trim().is_empty() {
354                is_included = true;
355            }
356        }
357
358        last_real_pos = Some(real_pos);
359
360        if is_included {
361            reportable.push(token);
362        }
363    }
364
365    reportable
366}
367
/// Returns the terminator of every line of `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line that has
/// no terminator. Empty input yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for raw_line in text.split_inclusive('\n') {
        let ending = match raw_line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_owned());
    }

    endings
}
381
382fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
383    let mut rendered = String::new();
384    let mut previous_line: Option<usize> = None;
385
386    for token in tokens {
387        if let Some(prev_line) = previous_line
388            && token.line_num > prev_line
389        {
390            for line in prev_line..token.line_num {
391                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
392                    rendered.push_str(line_ending.as_str());
393                }
394            }
395        }
396
397        let token_value = if token.is_text {
398            token.value.as_str()
399        } else {
400            token
401                .value
402                .strip_suffix("\r\n")
403                .or_else(|| token.value.strip_suffix('\n'))
404                .unwrap_or(token.value.as_str())
405        };
406
407        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
408            if token.is_matched {
409                rendered.push_str(token_value);
410            } else {
411                rendered.push('[');
412                rendered.push_str(token_value);
413                rendered.push(']');
414            }
415        } else {
416            rendered.push_str(token_value);
417        }
418
419        previous_line = Some(token.line_num);
420    }
421
422    rendered
423}
424
425impl<'a> Query<'a> {
426    /// Create a new query from text string and license index.
427    ///
428    /// This tokenizes the input text, looks up each token in the index dictionary,
429    /// and builds the query structures for matching.
430    ///
431    /// # Arguments
432    /// * `text` - The input text to tokenize
433    /// * `index` - The license index containing the token dictionary
434    ///
435    /// # Returns
436    /// A Result containing the Query or an error if binary detection fails
437    ///
438    /// Detection scans file-like text, so this uses Python's
439    /// `build_query(..., text_line_threshold=15)` threshold.
440    const TEXT_LINE_THRESHOLD: usize = 15;
441    const BINARY_LINE_THRESHOLD: usize = 50;
442    const MAX_TOKEN_PER_LINE: usize = 25;
443
444    fn compute_spdx_offset(
445        tokens: &[QueryToken],
446        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
447    ) -> Option<usize> {
448        let get_known_id = |i: usize| -> Option<TokenId> {
449            match tokens.get(i)? {
450                QueryToken::Known(known) => Some(known.id),
451                _ => None,
452            }
453        };
454
455        let spdx_id = dictionary.get("spdx")?;
456        let license_id = dictionary.get("license")?;
457        let identifier_id = dictionary.get("identifier")?;
458        let licence_id = dictionary.get("licence");
459
460        let licenses_id = dictionary.get("licenses");
461        let nuget_id = dictionary.get("nuget");
462        let org_id = dictionary.get("org");
463
464        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
465            ids.iter().all(|id| id.is_some())
466                && ids[0] == Some(spdx_id)
467                && (ids[1] == Some(license_id) || ids[1] == licence_id)
468                && ids[2] == Some(identifier_id)
469        };
470
471        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
472            licenses_id.is_some()
473                && nuget_id.is_some()
474                && org_id.is_some()
475                && ids[0] == licenses_id
476                && ids[1] == Some(nuget_id.unwrap())
477                && ids[2] == Some(org_id.unwrap())
478        };
479
480        if tokens.len() >= 3 {
481            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
482            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
483                return Some(0);
484            }
485        }
486
487        if tokens.len() >= 4 {
488            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
489            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
490                return Some(1);
491            }
492        }
493
494        if tokens.len() >= 5 {
495            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
496            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
497                return Some(2);
498            }
499        }
500
501        None
502    }
503
504    pub fn from_extracted_text(
505        text: &str,
506        index: &'a LicenseIndex,
507        binary_derived: bool,
508    ) -> Result<Self, anyhow::Error> {
509        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
510    }
511
512    pub fn from_extracted_text_with_deadline(
513        text: &str,
514        index: &'a LicenseIndex,
515        binary_derived: bool,
516        deadline: Option<Instant>,
517    ) -> Result<Self, anyhow::Error> {
518        let line_threshold = if binary_derived {
519            Self::BINARY_LINE_THRESHOLD
520        } else {
521            Self::TEXT_LINE_THRESHOLD
522        };
523
524        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
525    }
526
527    /// Iterate over query runs.
528    ///
529    /// Corresponds to Python: `query.query_runs` property iteration
530    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
531        self.query_run_ranges
532            .iter()
533            .map(|&(start, end)| QueryRun::new(self, start, end))
534            .collect()
535    }
536
537    fn with_source_options(
538        text: &str,
539        index: &'a LicenseIndex,
540        line_threshold: usize,
541        binary_derived: Option<bool>,
542        deadline: Option<Instant>,
543    ) -> Result<Self, anyhow::Error> {
544        crate::license_detection::ensure_within_deadline(deadline)?;
545        let is_binary = match binary_derived {
546            Some(is_binary) => is_binary,
547            None => Self::detect_binary(text)?,
548        };
549        let has_long_lines = Self::detect_long_lines(text);
550
551        let mut tokens = Vec::new();
552        let mut line_by_pos = Vec::new();
553        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
554        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
555        let mut shorts_and_digits_pos = PositionSet::new();
556        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();
557
558        let mut known_pos: Option<usize> = None;
559        let mut started = false;
560        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();
561
562        for (current_line, (line_index, line)) in (1usize..).zip(text.lines().enumerate()) {
563            if line_index.is_multiple_of(128) {
564                crate::license_detection::ensure_within_deadline(deadline)?;
565            }
566
567            let line_trimmed = line.trim();
568            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();
569
570            let mut line_first_known_pos = None;
571
572            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);
573
574            for query_token in &line_query_tokens {
575                match query_token {
576                    QueryToken::Known(known_token) => {
577                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
578                        started = true;
579                        tokens.push(known_token.id);
580                        line_by_pos.push(current_line);
581                        line_tokens.push(Some(*known_token));
582
583                        if line_first_known_pos.is_none() {
584                            line_first_known_pos = known_pos;
585                        }
586
587                        if known_token.is_short_or_digit {
588                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
589                        }
590                    }
591                    QueryToken::Unknown if !started => {
592                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
593                        line_tokens.push(None);
594                    }
595                    QueryToken::Unknown => {
596                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
597                        line_tokens.push(None);
598                    }
599                    QueryToken::Stopword if !started => {
600                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
601                    }
602                    QueryToken::Stopword => {
603                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
604                    }
605                }
606            }
607
608            let line_last_known_pos = known_pos;
609
610            let spdx_start_offset =
611                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);
612
613            if let Some(offset) = spdx_start_offset
614                && let Some(line_first_known_pos) = line_first_known_pos
615            {
616                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
617                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
618                let spdx_start_known_pos = line_first_known_pos + offset;
619
620                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
621                    let spdx_end = line_last_known_pos.unwrap() + 1;
622                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
623                }
624            }
625            tokens_by_line.push(line_tokens);
626        }
627
628        crate::license_detection::ensure_within_deadline(deadline)?;
629
630        let high_matchables: PositionSet = tokens
631            .iter()
632            .enumerate()
633            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
634            .map(|(pos, _tid)| pos)
635            .collect();
636
637        let low_matchables: PositionSet = tokens
638            .iter()
639            .enumerate()
640            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
641            .map(|(pos, _tid)| pos)
642            .collect();
643
644        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);
645
646        Ok(Query {
647            text: text.to_string(),
648            tokens,
649            line_by_pos,
650            unknowns_by_pos,
651            stopwords_by_pos,
652            shorts_and_digits_pos,
653            high_matchables,
654            low_matchables,
655            is_binary,
656            query_run_ranges: query_runs,
657            spdx_lines,
658            index,
659        })
660    }
661
662    /// Detect if text is binary content.
663    ///
664    /// Binary detection checks for:
665    /// - Null bytes (0x00)
666    /// - High ratio of non-printable characters
667    ///
668    /// # Arguments
669    /// * `text` - The text to analyze
670    ///
671    /// # Returns
672    /// true if binary, false otherwise
673    ///
674    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
675    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
676        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
677
678        if null_byte_count > 0 {
679            return Ok(true);
680        }
681
682        let non_printable_ratio = text
683            .chars()
684            .filter(|&c| {
685                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
686            })
687            .count() as f64
688            / text.len().max(1) as f64;
689
690        Ok(non_printable_ratio > 0.3)
691    }
692
693    /// Detect if text has very long lines (for minified JS/CSS).
694    ///
695    /// # Arguments
696    /// * `text` - The text to analyze
697    ///
698    /// # Returns
699    /// true if there are lines with many tokens, false otherwise
700    ///
701    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
702    fn detect_long_lines(text: &str) -> bool {
703        text.lines()
704            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
705    }
706
707    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
708        lines
709            .iter()
710            .flat_map(|line| {
711                if line.is_empty() {
712                    return Vec::new();
713                }
714
715                if line.len() <= Self::MAX_TOKEN_PER_LINE {
716                    vec![line.clone()]
717                } else {
718                    line.chunks(Self::MAX_TOKEN_PER_LINE)
719                        .map(|chunk| chunk.to_vec())
720                        .collect()
721                }
722            })
723            .collect()
724    }
725
726    fn compute_query_runs(
727        tokens_by_line: &[Vec<Option<KnownToken>>],
728        line_threshold: usize,
729        has_long_lines: bool,
730    ) -> Vec<(usize, Option<usize>)> {
731        let processed_lines = if has_long_lines {
732            Self::break_long_lines(tokens_by_line)
733        } else {
734            tokens_by_line.to_vec()
735        };
736
737        let mut query_runs = Vec::new();
738        let mut query_run_start = 0usize;
739        let mut query_run_end = None;
740        let mut empty_lines = 0usize;
741        let mut pos = 0usize;
742        let mut query_run_is_all_digit = true;
743
744        for line_tokens in processed_lines {
745            if query_run_end.is_some() && empty_lines >= line_threshold {
746                if !query_run_is_all_digit {
747                    query_runs.push((query_run_start, query_run_end));
748                }
749                query_run_start = pos;
750                query_run_end = None;
751                empty_lines = 0;
752                query_run_is_all_digit = true;
753            }
754
755            if query_run_end.is_none() {
756                query_run_start = pos;
757            }
758
759            if line_tokens.is_empty() {
760                empty_lines += 1;
761                continue;
762            }
763
764            let line_is_all_digit = line_tokens
765                .iter()
766                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
767            let mut line_has_known_tokens = false;
768            let mut line_has_good_tokens = false;
769
770            for known in line_tokens.into_iter().flatten() {
771                line_has_known_tokens = true;
772                if known.kind == TokenKind::Legalese {
773                    line_has_good_tokens = true;
774                }
775                if !known.is_digit_only {
776                    query_run_is_all_digit = false;
777                }
778                query_run_end = Some(pos);
779                pos += 1;
780            }
781
782            if line_is_all_digit || !line_has_known_tokens {
783                empty_lines += 1;
784                continue;
785            }
786
787            if line_has_good_tokens {
788                empty_lines = 0;
789            } else {
790                empty_lines += 1;
791            }
792        }
793
794        if let Some(end) = query_run_end
795            && !query_run_is_all_digit
796        {
797            query_runs.push((query_run_start, Some(end)));
798        }
799
800        query_runs
801    }
802
803    /// Get the length of the query in tokens.
804    ///
805    /// Get the line number for a token position.
806    ///
807    /// # Arguments
808    /// * `pos` - The token position
809    ///
810    /// # Returns
811    /// The line number (1-based)
812    #[inline]
813    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
814        self.line_by_pos.get(pos).copied()
815    }
816
817    /// Check if the query is empty (no known tokens).
818    #[inline]
819    pub fn is_empty(&self) -> bool {
820        self.tokens.is_empty()
821    }
822
823    /// Get a query run covering the entire query.
824    ///
825    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
826    pub fn whole_query_run(&self) -> QueryRun<'a> {
827        QueryRun::whole_query_snapshot(self)
828    }
829
830    /// Subtract matched span positions from matchables.
831    ///
832    /// This removes the positions from both high and low matchables.
833    ///
834    /// # Arguments
835    /// * `span` - The span of positions to subtract
836    ///
837    /// Corresponds to Python: `subtract()` method (lines 328-334)
838    pub fn subtract(&mut self, span: &PositionSpan) {
839        self.high_matchables.remove_span(span);
840        self.low_matchables.remove_span(span);
841    }
842
843    /// Extract matched text for a given line range.
844    ///
845    /// Returns the text from the original input between start_line and end_line
846    /// (both inclusive, 1-indexed).
847    ///
848    /// # Arguments
849    /// * `start_line` - Starting line number (1-indexed)
850    /// * `end_line` - Ending line number (1-indexed)
851    ///
852    /// # Returns
853    /// The matched text, or empty string if lines are out of range
854    ///
855    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
856    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
857        matched_text_from_text(&self.text, start_line, end_line)
858    }
859}
860
861#[derive(Debug, Clone)]
862struct WholeQueryRunSnapshot<'a> {
863    index: &'a LicenseIndex,
864    tokens: Vec<TokenId>,
865    line_by_pos: Vec<usize>,
866    high_matchables: PositionSet,
867    low_matchables: PositionSet,
868}
869
870/// A query run is a slice of query tokens identified by a start and end positions.
871///
872/// Query runs break a query into manageable chunks for efficient matching.
873/// They track matchable token positions and support subtraction of matched spans.
874///
875/// Based on Python QueryRun class at:
876/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
877#[derive(Debug, Clone)]
878pub struct QueryRun<'a> {
879    query: Option<&'a Query<'a>>,
880    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
881    pub start: usize,
882    pub end: Option<usize>,
883    cached_high_matchables: OnceCell<PositionSet>,
884    cached_low_matchables: OnceCell<PositionSet>,
885    combined_matchables: RefCell<Option<PositionSet>>,
886}
887
888impl<'a> QueryRun<'a> {
889    /// Create a new query run from a query with start and end positions.
890    ///
891    /// # Arguments
892    /// * `query` - The parent query
893    /// * `start` - The start position (inclusive)
894    /// * `end` - The end position (inclusive), or None for an empty run
895    ///
896    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
897    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
898        Self {
899            query: Some(query),
900            whole_query_snapshot: None,
901            start,
902            end,
903            cached_high_matchables: OnceCell::new(),
904            cached_low_matchables: OnceCell::new(),
905            combined_matchables: RefCell::new(None),
906        }
907    }
908
909    fn whole_query_snapshot(query: &Query<'a>) -> Self {
910        let end = if query.is_empty() {
911            None
912        } else {
913            Some(query.tokens.len() - 1)
914        };
915
916        Self {
917            query: None,
918            whole_query_snapshot: Some(WholeQueryRunSnapshot {
919                index: query.index,
920                tokens: query.tokens.clone(),
921                line_by_pos: query.line_by_pos.clone(),
922                high_matchables: query.high_matchables.clone(),
923                low_matchables: query.low_matchables.clone(),
924            }),
925            start: 0,
926            end,
927            cached_high_matchables: OnceCell::new(),
928            cached_low_matchables: OnceCell::new(),
929            combined_matchables: RefCell::new(None),
930        }
931    }
932
933    fn source_tokens(&self) -> &[TokenId] {
934        if let Some(query) = self.query {
935            &query.tokens
936        } else {
937            &self
938                .whole_query_snapshot
939                .as_ref()
940                .expect("snapshot-backed whole query run should have snapshot data")
941                .tokens
942        }
943    }
944
945    fn source_line_by_pos(&self) -> &[usize] {
946        if let Some(query) = self.query {
947            &query.line_by_pos
948        } else {
949            &self
950                .whole_query_snapshot
951                .as_ref()
952                .expect("snapshot-backed whole query run should have snapshot data")
953                .line_by_pos
954        }
955    }
956
957    fn source_high_matchables(&self) -> &PositionSet {
958        if let Some(query) = self.query {
959            &query.high_matchables
960        } else {
961            &self
962                .whole_query_snapshot
963                .as_ref()
964                .expect("snapshot-backed whole query run should have snapshot data")
965                .high_matchables
966        }
967    }
968
969    fn source_low_matchables(&self) -> &PositionSet {
970        if let Some(query) = self.query {
971            &query.low_matchables
972        } else {
973            &self
974                .whole_query_snapshot
975                .as_ref()
976                .expect("snapshot-backed whole query run should have snapshot data")
977                .low_matchables
978        }
979    }
980
981    /// Get the license index used by this query run.
982    pub fn get_index(&self) -> &LicenseIndex {
983        if let Some(query) = self.query {
984            query.index
985        } else {
986            self.whole_query_snapshot
987                .as_ref()
988                .expect("snapshot-backed whole query run should have snapshot data")
989                .index
990        }
991    }
992
993    /// Get the line number for a specific token position.
994    ///
995    /// # Arguments
996    /// * `pos` - Absolute token position in the query
997    ///
998    /// # Returns
999    /// The line number (1-based), or None if position is out of range
1000    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
1001        self.source_line_by_pos().get(pos).copied()
1002    }
1003
1004    /// Get the sequence of token IDs for this run.
1005    ///
1006    /// Returns empty slice if end is None.
1007    ///
1008    /// Corresponds to Python: `tokens` property (lines 779-786)
1009    pub fn tokens(&self) -> &[TokenId] {
1010        match self.end {
1011            Some(end) => &self.source_tokens()[self.start..=end],
1012            None => &[],
1013        }
1014    }
1015
1016    /// Iterate over token IDs with their absolute positions.
1017    ///
1018    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
1019    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
1020        self.tokens()
1021            .iter()
1022            .copied()
1023            .enumerate()
1024            .map(|(i, tid)| (self.start + i, tid))
1025    }
1026
1027    /// Check if this query run contains only digit tokens.
1028    ///
1029    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
1030    pub fn is_digits_only(&self) -> bool {
1031        self.tokens()
1032            .iter()
1033            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
1034    }
1035
1036    /// Check if this query run has matchable tokens.
1037    ///
1038    /// # Arguments
1039    /// * `include_low` - If true, include low-value tokens in the check
1040    /// * `exclude_positions` - Optional set of spans containing positions to exclude
1041    ///
1042    /// Returns true if there are matchable tokens remaining
1043    ///
1044    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
1045    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
1046        if self.is_digits_only() {
1047            return false;
1048        }
1049
1050        let matchables = self.matchables(include_low);
1051
1052        if exclude_positions.is_empty() {
1053            return !matchables.is_empty();
1054        }
1055
1056        let mut matchable_set = matchables;
1057        for span in exclude_positions {
1058            matchable_set.remove_span(span);
1059        }
1060
1061        !matchable_set.is_empty()
1062    }
1063
1064    pub fn matchables(&self, include_low: bool) -> PositionSet {
1065        if include_low {
1066            if let Some(ref cached) = *self.combined_matchables.borrow() {
1067                return cached.clone();
1068            }
1069            let combined = self.low_matchables().union(&self.high_matchables());
1070            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1071            combined
1072        } else {
1073            self.high_matchables()
1074        }
1075    }
1076
1077    pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1078        let high_matchables = self.high_matchables();
1079        if high_matchables.is_empty() {
1080            return Vec::new();
1081        }
1082
1083        let matchables = self.matchables(true);
1084        self.tokens_with_pos()
1085            .map(|(pos, tid)| {
1086                if matchables.contains(pos) {
1087                    Some(tid)
1088                } else {
1089                    None
1090                }
1091            })
1092            .collect()
1093    }
1094
1095    pub fn high_matchables(&self) -> PositionSet {
1096        self.cached_high_matchables
1097            .get_or_init(|| {
1098                let start = self.start;
1099                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1100                let source = self.source_high_matchables();
1101                let live_span = PositionSpan::new(start, end);
1102                source
1103                    .iter()
1104                    .filter(|&pos| live_span.contains(pos))
1105                    .collect()
1106            })
1107            .clone()
1108    }
1109
1110    pub fn low_matchables(&self) -> PositionSet {
1111        self.cached_low_matchables
1112            .get_or_init(|| {
1113                let start = self.start;
1114                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1115                let source = self.source_low_matchables();
1116                let live_span = PositionSpan::new(start, end);
1117                source
1118                    .iter()
1119                    .filter(|&pos| live_span.contains(pos))
1120                    .collect()
1121            })
1122            .clone()
1123    }
1124}
1125
1126#[cfg(test)]
1127mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs