provenant/license_detection/query/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Query processing - tokenized input for license matching.
5
6use crate::license_detection::LicenseDetectionError;
7use crate::license_detection::index::LicenseIndex;
8use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
9use crate::license_detection::models::PositionSpan;
10use crate::license_detection::position_set::PositionSet;
11use crate::license_detection::spdx_lid::split_spdx_lid;
12use crate::license_detection::tokenize::STOPWORDS;
13use crate::license_detection::tokenize::tokenize_as_ids;
14use regex::Regex;
15use std::cell::{OnceCell, RefCell};
16use std::collections::HashMap;
17use std::sync::LazyLock;
18use std::time::Instant;
19
/// Matches one query word: a run of word characters (underscore excluded),
/// optionally followed by a single `+` and further word characters.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into two kinds of pieces: the `token` group uses the same word
/// shape as `QUERY_PATTERN`; the `punct` group captures runs of underscores,
/// non-word characters, whitespace and `+` that sit between words.
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
26
/// One piece of the original text produced while re-tokenizing it for
/// matched-text rendering: either a word or a run of punctuation/whitespace.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece as it will be rendered.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Query token position for words known to the index dictionary;
    /// `None` for unknown words, stopwords and punctuation.
    pos: Option<usize>,
    /// `true` for words, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// Set once the piece's position is found in the matched position set.
    is_matched: bool,
}
35
/// Tokenized form of an input text, ready for license matching.
///
/// Query holds:
/// - Known token IDs (tokens existing in the index dictionary)
/// - Token positions and their corresponding line numbers (line_by_pos)
/// - Unknown tokens (tokens not in dictionary) tracked per position
/// - Stopwords tracked per position
/// - Positions with short/digit-only tokens
/// - High and low matchable token positions (for tracking what's been matched)
///
/// Based on Python Query class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
#[derive(Debug)]
pub struct Query<'a> {
    /// The original input text.
    ///
    /// Corresponds to Python: `self.query_string` (line 215)
    pub text: String,

    /// Token IDs for known tokens (tokens found in the index dictionary),
    /// in text order. A token's index in this vector is its "position".
    ///
    /// Corresponds to Python: `self.tokens = []` (line 228)
    pub tokens: Vec<TokenId>,

    /// Mapping from token position to line number (1-based)
    ///
    /// Each token position in `self.tokens` maps to the line number where it appears.
    /// This is used for match position reporting.
    ///
    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
    pub line_by_pos: Vec<usize>,

    /// Mapping from token position to count of unknown tokens after that position
    ///
    /// Unknown tokens are those not found in the dictionary. We track them by
    /// counting how many unknown tokens appear after each known position.
    /// Unknown tokens before the first known token are tracked with the key `None`.
    ///
    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Mapping from token position to count of stopwords after that position
    ///
    /// Keyed the same way as `unknowns_by_pos`, but counting stopwords;
    /// stopwords seen before the first known token use the key `None`.
    ///
    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Set of positions with single-character or digit-only tokens
    ///
    /// These tokens have special handling in matching.
    ///
    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
    pub shorts_and_digits_pos: PositionSet,

    /// High-value matchable token positions (legalese tokens)
    ///
    /// Built from the positions whose dictionary token kind is
    /// `TokenKind::Legalese` (in the Python reference: token ID < len_legalese).
    ///
    /// Corresponds to Python: `self.high_matchables` (line 293)
    pub high_matchables: PositionSet,

    /// Low-value matchable token positions (non-legalese tokens)
    ///
    /// Built from the positions whose dictionary token kind is
    /// `TokenKind::Regular` (in the Python reference: token ID >= len_legalese).
    ///
    /// Corresponds to Python: `self.low_matchables` (line 294)
    pub low_matchables: PositionSet,

    /// True if the query is treated as binary content: either the flag
    /// supplied by the caller or, when none is given, the built-in detection.
    ///
    /// Corresponds to Python: `self.is_binary = False` (line 225)
    pub is_binary: bool,

    /// Raw query run ranges (start, end) computed during tokenization.
    ///
    /// Both bounds are token positions; `end` is the position of the last
    /// token in the run. QueryRuns are created on-demand from these ranges.
    ///
    /// Corresponds to Python: `self.query_runs = []` (line 274)
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// SPDX-License-Identifier lines found during tokenization.
    ///
    /// Each tuple is (spdx_text, start_token_pos, end_token_pos), where
    /// `end_token_pos` is one past the last known token of the line.
    /// Used for creating LicenseMatches with correct token positions.
    ///
    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Reference to the license index for dictionary access and metadata
    pub index: &'a LicenseIndex,
}
127
/// Returns lines `start_line..=end_line` of `text`, joined with `\n`.
///
/// Line numbers are 1-based and inclusive. Line terminators (`\n` or `\r\n`)
/// are dropped from each line before joining. An empty string is returned
/// when either bound is zero, when `start_line > end_line`, or when the
/// range lies entirely past the end of the text.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    // start_line >= 1 and end_line >= start_line, so neither expression can
    // underflow or overflow.
    let lines_to_skip = start_line - 1;
    let lines_wanted = end_line - start_line + 1;

    let mut selected = String::new();
    for (nth, line) in text
        .lines()
        .skip(lines_to_skip)
        .take(lines_wanted)
        .enumerate()
    {
        if nth > 0 {
            selected.push('\n');
        }
        selected.push_str(line);
    }
    selected
}
146
147pub fn matched_text_diagnostics_from_text(
148    text: &str,
149    query: &Query<'_>,
150    matched_positions: &PositionSet,
151    start_pos: usize,
152    end_pos: usize,
153    start_line: usize,
154    end_line: usize,
155) -> String {
156    let tokens = tokenize_matched_text(text, query);
157    let reportable_tokens = collect_reportable_tokens(
158        tokens,
159        matched_positions,
160        start_pos,
161        end_pos,
162        start_line,
163        end_line,
164    );
165    let line_endings = collect_line_endings(text);
166
167    render_diagnostic_tokens(&reportable_tokens, &line_endings)
168}
169
/// Extracts matched text using token-span mode instead of whole-line mode.
///
/// This is used for files with very long lines (e.g., minified JS) where
/// whole-line extraction would return megabytes of text for a small match.
/// Instead, it returns only the tokens within the matched span, producing
/// output similar to `matched_text_diagnostics_from_text()` but without
/// the diagnostic `[bracket]` wrapping.
///
/// This function performs no fallback of its own: if no token in the line
/// range carries a matching position, the result is simply empty. Callers
/// that want whole-line output in that case must call
/// `matched_text_from_text()` themselves.
///
/// # Arguments
/// * `text` - The original text the query was built from
/// * `query` - The query whose index dictionary decides which words have positions
/// * `matched_positions` - Token positions that were actually matched
/// * `start_pos` / `end_pos` - First and last token position of the match
/// * `start_line` / `end_line` - 1-based line range bounding the match
pub fn matched_text_from_tokens(
    text: &str,
    query: &Query<'_>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> String {
    let tokens = tokenize_matched_text(text, query);
    let reportable_tokens = collect_reportable_tokens(
        tokens,
        matched_positions,
        start_pos,
        end_pos,
        start_line,
        end_line,
    );
    let line_endings = collect_line_endings(text);

    render_plain_tokens(&reportable_tokens, &line_endings)
}
201
202fn render_plain_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
203    let mut rendered = String::new();
204    let mut previous_line: Option<usize> = None;
205
206    for token in tokens {
207        if let Some(prev_line) = previous_line
208            && token.line_num > prev_line
209        {
210            for line in prev_line..token.line_num {
211                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
212                    rendered.push_str(line_ending.as_str());
213                }
214            }
215        }
216
217        let token_value = if token.is_text {
218            token.value.as_str()
219        } else {
220            token
221                .value
222                .strip_suffix("\r\n")
223                .or_else(|| token.value.strip_suffix('\n'))
224                .unwrap_or(token.value.as_str())
225        };
226
227        rendered.push_str(token_value);
228
229        previous_line = Some(token.line_num);
230    }
231
232    rendered
233}
234
235fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
236    let mut tokens = Vec::new();
237    let mut pos = 0usize;
238    for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
239        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
240            if let Some(token_match) = capture.name("token") {
241                let token_text = token_match.as_str();
242                let retokenized: Vec<String> = QUERY_PATTERN
243                    .find_iter(&token_text.to_lowercase())
244                    .map(|m| m.as_str().to_string())
245                    .filter(|token| !STOPWORDS.contains(token.as_str()))
246                    .collect();
247
248                if retokenized.is_empty() {
249                    tokens.push(MatchedTextToken {
250                        value: token_text.to_string(),
251                        line_num,
252                        pos: None,
253                        is_text: true,
254                        is_matched: false,
255                    });
256                } else if retokenized.len() == 1 {
257                    let token = &retokenized[0];
258                    let token_pos = if query.index.dictionary.get(token).is_some() {
259                        let current_pos = pos;
260                        pos += 1;
261                        Some(current_pos)
262                    } else {
263                        None
264                    };
265
266                    tokens.push(MatchedTextToken {
267                        value: token_text.to_string(),
268                        line_num,
269                        pos: token_pos,
270                        is_text: true,
271                        is_matched: false,
272                    });
273                } else {
274                    for token in retokenized {
275                        let token_pos = if query.index.dictionary.get(&token).is_some() {
276                            let current_pos = pos;
277                            pos += 1;
278                            Some(current_pos)
279                        } else {
280                            None
281                        };
282
283                        tokens.push(MatchedTextToken {
284                            value: token,
285                            line_num,
286                            pos: token_pos,
287                            is_text: true,
288                            is_matched: false,
289                        });
290                    }
291                }
292            } else if let Some(punct_match) = capture.name("punct") {
293                tokens.push(MatchedTextToken {
294                    value: punct_match.as_str().to_string(),
295                    line_num,
296                    pos: None,
297                    is_text: false,
298                    is_matched: false,
299                });
300            }
301        }
302    }
303
304    tokens
305}
306
/// Selects the tokens that should appear in rendered matched text.
///
/// Tokens outside `start_line..=end_line` are dropped. Within that range a
/// token is kept when any of these holds:
/// - its position is contained in `matched_positions` (it is also flagged
///   `is_matched`);
/// - it lies between the token at `start_pos` and the token at `end_pos`,
///   both included (this also keeps punctuation and unknown words in between);
/// - it is the token immediately after the `end_pos` token and is a
///   non-whitespace punctuation run.
///
/// # Arguments
/// * `tokens` - All tokens of the text, in order
/// * `matched_positions` - Token positions that were matched
/// * `start_pos` / `end_pos` - First and last token position of the match
/// * `start_line` / `end_line` - 1-based, inclusive line range to consider
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`: the start_pos token has been seen and the end not yet reached.
    let mut started = false;
    // `finished`: the end_pos token has been seen.
    let mut finished = false;
    // Index (in `tokens`) of the end_pos token, kept until the token after it
    // has been examined.
    let mut end_real_pos = None;
    // Index (in `tokens`) of the token handled in the previous iteration.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Tokens are in line order, so nothing after this can be in range.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything inside the span is kept, matched or not.
        if started && !finished {
            is_included = true;
        }

        // Checked after the span test above so that the end token itself is
        // still treated as inside the span.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token directly following the end token: keep it
        // when it is punctuation with some non-whitespace content.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
368
/// Records the terminator of every line in `text`.
///
/// Entry `i` is `"\r\n"`, `"\n"`, or `""` (for a final line without a
/// terminator) and belongs to line `i + 1`. Empty input yields an empty
/// vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let terminator = match segment.strip_suffix('\n') {
            Some(body) if body.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(terminator.to_owned());
    }

    endings
}
382
383fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
384    let mut rendered = String::new();
385    let mut previous_line: Option<usize> = None;
386
387    for token in tokens {
388        if let Some(prev_line) = previous_line
389            && token.line_num > prev_line
390        {
391            for line in prev_line..token.line_num {
392                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
393                    rendered.push_str(line_ending.as_str());
394                }
395            }
396        }
397
398        let token_value = if token.is_text {
399            token.value.as_str()
400        } else {
401            token
402                .value
403                .strip_suffix("\r\n")
404                .or_else(|| token.value.strip_suffix('\n'))
405                .unwrap_or(token.value.as_str())
406        };
407
408        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
409            if token.is_matched {
410                rendered.push_str(token_value);
411            } else {
412                rendered.push('[');
413                rendered.push_str(token_value);
414                rendered.push(']');
415            }
416        } else {
417            rendered.push_str(token_value);
418        }
419
420        previous_line = Some(token.line_num);
421    }
422
423    rendered
424}
425
426impl<'a> Query<'a> {
    /// Line threshold used when splitting regular (non-binary-derived) text
    /// into query runs: a run is closed once this many consecutive
    /// low-value lines have been counted.
    ///
    /// Detection scans file-like text, so this uses Python's
    /// `build_query(..., text_line_threshold=15)` threshold.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold used instead of `TEXT_LINE_THRESHOLD` when the text is
    /// flagged as binary-derived.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when long lines are broken up
    /// before query runs are computed.
    const MAX_TOKEN_PER_LINE: usize = 25;
444
445    fn compute_spdx_offset(
446        tokens: &[QueryToken],
447        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
448    ) -> Option<usize> {
449        let get_known_id = |i: usize| -> Option<TokenId> {
450            match tokens.get(i)? {
451                QueryToken::Known(known) => Some(known.id),
452                _ => None,
453            }
454        };
455
456        let spdx_id = dictionary.get("spdx")?;
457        let license_id = dictionary.get("license")?;
458        let identifier_id = dictionary.get("identifier")?;
459        let licence_id = dictionary.get("licence");
460
461        let licenses_id = dictionary.get("licenses");
462        let nuget_id = dictionary.get("nuget");
463        let org_id = dictionary.get("org");
464
465        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
466            ids.iter().all(|id| id.is_some())
467                && ids[0] == Some(spdx_id)
468                && (ids[1] == Some(license_id) || ids[1] == licence_id)
469                && ids[2] == Some(identifier_id)
470        };
471
472        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
473            licenses_id.is_some()
474                && nuget_id.is_some()
475                && org_id.is_some()
476                && ids[0] == licenses_id
477                && ids[1] == Some(nuget_id.unwrap())
478                && ids[2] == Some(org_id.unwrap())
479        };
480
481        if tokens.len() >= 3 {
482            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
483            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
484                return Some(0);
485            }
486        }
487
488        if tokens.len() >= 4 {
489            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
490            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
491                return Some(1);
492            }
493        }
494
495        if tokens.len() >= 5 {
496            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
497            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
498                return Some(2);
499            }
500        }
501
502        None
503    }
504
    /// Builds a query from already-extracted text, without a deadline.
    ///
    /// Convenience wrapper around `from_extracted_text_with_deadline` that
    /// passes `None` as the deadline.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether the text was extracted from binary content
    pub(crate) fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, LicenseDetectionError> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
512
    /// Builds a query from already-extracted text, honouring an optional deadline.
    ///
    /// `binary_derived` selects the query-run line threshold
    /// (`BINARY_LINE_THRESHOLD` when true, `TEXT_LINE_THRESHOLD` otherwise)
    /// and is also recorded as the query's `is_binary` flag, so the built-in
    /// binary detection is not run on this path.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether the text was extracted from binary content
    /// * `deadline` - Optional deadline forwarded to the deadline checks made
    ///   while tokenizing
    pub(crate) fn from_extracted_text_with_deadline(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
        deadline: Option<Instant>,
    ) -> Result<Self, LicenseDetectionError> {
        let line_threshold = if binary_derived {
            Self::BINARY_LINE_THRESHOLD
        } else {
            Self::TEXT_LINE_THRESHOLD
        };

        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
    }
527
    /// Build the query runs of this query.
    ///
    /// Creates one `QueryRun` per entry of `query_run_ranges` and returns
    /// them as a freshly allocated vector on every call.
    ///
    /// Corresponds to Python: `query.query_runs` property iteration
    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
        self.query_run_ranges
            .iter()
            .map(|&(start, end)| QueryRun::new(self, start, end))
            .collect()
    }
537
    /// Create a new query from text and a license index.
    ///
    /// This tokenizes the input text line by line, looks up each token in the
    /// index dictionary, and builds the query structures for matching.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `line_threshold` - Number of consecutive low-value lines that closes a query run
    /// * `binary_derived` - Binary flag supplied by the caller; when `None`,
    ///   `detect_binary` decides
    /// * `deadline` - Optional deadline, checked on entry, every 128 lines and
    ///   once after tokenization
    ///
    /// # Errors
    /// Propagates the error returned by `ensure_within_deadline`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, LicenseDetectionError> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text),
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one.
        let mut known_pos: Option<usize> = None;
        // Becomes true once the first known token has been seen.
        let mut started = false;
        // Per line: `Some` for known tokens, `None` for unknown ones.
        // Stopwords are not recorded here.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for (current_line, (line_index, line)) in (1usize..).zip(text.lines().enumerate()) {
            // Keep the deadline check cheap by only doing it periodically.
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknowns/stopwords ahead of any known token are
                    // counted under the `None` key.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // NOTE(review): `offset` is an index into all tokens of the line
            // (unknowns and stopwords included) but is added to a
            // known-token position below — confirm this matches the
            // reference behaviour.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset;

                // A known token was seen on this line, so `known_pos` (and
                // therefore `line_last_known_pos`) is `Some` here.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    // Stored end is one past the last known token of the line.
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }
            tokens_by_line.push(line_tokens);
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
662
663    /// Detect if text is binary content.
664    ///
665    /// Binary detection checks for:
666    /// - Null bytes (0x00)
667    /// - High ratio of non-printable characters
668    ///
669    /// # Arguments
670    /// * `text` - The text to analyze
671    ///
672    /// # Returns
673    /// true if binary, false otherwise
674    ///
675    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
676    fn detect_binary(text: &str) -> bool {
677        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
678
679        if null_byte_count > 0 {
680            return true;
681        }
682
683        let non_printable_ratio = text
684            .chars()
685            .filter(|&c| {
686                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
687            })
688            .count() as f64
689            / text.len().max(1) as f64;
690
691        non_printable_ratio > 0.3
692    }
693
    /// Detect if text has very long lines (for minified JS/CSS).
    ///
    /// A line is considered long when `count_tokens` reports more than 25
    /// tokens for it.
    ///
    /// NOTE(review): the literal 25 has the same value as
    /// `Self::MAX_TOKEN_PER_LINE`; consider using the constant so the two
    /// cannot drift apart (requires `count_tokens` to return `usize`).
    ///
    /// # Arguments
    /// * `text` - The text to analyze
    ///
    /// # Returns
    /// true if there are lines with many tokens, false otherwise
    ///
    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
    fn detect_long_lines(text: &str) -> bool {
        text.lines()
            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
    }
707
708    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
709        lines
710            .iter()
711            .flat_map(|line| {
712                if line.is_empty() {
713                    return Vec::new();
714                }
715
716                if line.len() <= Self::MAX_TOKEN_PER_LINE {
717                    vec![line.clone()]
718                } else {
719                    line.chunks(Self::MAX_TOKEN_PER_LINE)
720                        .map(|chunk| chunk.to_vec())
721                        .collect()
722                }
723            })
724            .collect()
725    }
726
    /// Splits the token stream into query runs.
    ///
    /// Walks the lines in order while counting consecutive "low-value" lines
    /// (empty lines, lines holding only unknown or digit-only tokens, and
    /// lines without any legalese token). Once a run has started and that
    /// count reaches `line_threshold`, the run is closed and a new one begins.
    /// Runs made up solely of digit-only tokens are discarded.
    ///
    /// # Arguments
    /// * `tokens_by_line` - Per line, `Some` for known tokens and `None` for unknown ones
    /// * `line_threshold` - Number of consecutive low-value lines that closes a run
    /// * `has_long_lines` - When true, lines are first split by `break_long_lines`
    ///
    /// # Returns
    /// `(start, end)` pairs of known-token positions; `end` is the position
    /// of the last token of the run.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run has not received any known token yet.
        let mut query_run_end = None;
        // Consecutive low-value lines seen since the last legalese line.
        let mut empty_lines = 0usize;
        // Position of the next known token (unknown tokens do not advance it).
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Close the current run before looking at this line if enough
            // low-value lines have accumulated.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // A run that has no tokens yet starts at the next token position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) are treated like digit-only ones here.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Known tokens always extend the current run, even on lines that
            // are counted as low-value below.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            // Only a line with at least one legalese token resets the count.
            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Flush the final run.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
803
    /// Get the line number for a token position.
    ///
    /// # Arguments
    /// * `pos` - The token position
    ///
    /// # Returns
    /// The line number (1-based), or `None` if `pos` is not a valid position
    /// in `line_by_pos`
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
817
    /// Check if the query is empty (no known tokens).
    ///
    /// Unknown tokens and stopwords are not stored in `tokens`, so a text
    /// made only of those is also reported as empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
823
    /// Get a query run covering the entire query.
    ///
    /// The returned run does not borrow this query: it carries its own
    /// copies of the tokens, line mapping and matchable position sets as
    /// they are at the time of the call.
    ///
    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
830
    /// Subtract matched span positions from matchables.
    ///
    /// This removes the positions from both high and low matchables, so they
    /// are no longer reported as matchable by this query.
    ///
    /// # Arguments
    /// * `span` - The span of positions to subtract
    ///
    /// Corresponds to Python: `subtract()` method (lines 328-334)
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
843
    /// Extract matched text for a given line range.
    ///
    /// Returns the text from the original input between start_line and end_line
    /// (both inclusive, 1-indexed). Delegates to `matched_text_from_text`
    /// using this query's stored `text`.
    ///
    /// # Arguments
    /// * `start_line` - Starting line number (1-indexed)
    /// * `end_line` - Ending line number (1-indexed)
    ///
    /// # Returns
    /// The matched text, or empty string if lines are out of range
    ///
    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
860}
861
/// Owned copy of the parts of a `Query` that a whole-query run needs, so the
/// run can exist without borrowing the query itself.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The license index the query was built against.
    index: &'a LicenseIndex,
    /// Copy of the query's known token IDs.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line mapping.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchable positions at snapshot time.
    high_matchables: PositionSet,
    /// Copy of the query's low matchable positions at snapshot time.
    low_matchables: PositionSet,
}
870
/// A query run is a slice of query tokens identified by a start and end positions.
///
/// Query runs break a query into manageable chunks for efficient matching.
/// They track matchable token positions and support subtraction of matched spans.
///
/// A run is backed either by a borrowed parent query (`query`) or by an
/// owned snapshot of a whole query (`whole_query_snapshot`); the
/// constructors in this file set exactly one of the two.
///
/// Based on Python QueryRun class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Parent query, when the run borrows one.
    query: Option<&'a Query<'a>>,
    /// Owned query data, when the run was built as a whole-query snapshot.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// First token position of the run (inclusive).
    pub start: usize,
    /// Last token position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    // The three fields below start out unset in both constructors;
    // presumably they memoize matchable position sets — confirm against the
    // accessor methods.
    cached_high_matchables: OnceCell<PositionSet>,
    cached_low_matchables: OnceCell<PositionSet>,
    combined_matchables: RefCell<Option<PositionSet>>,
}
888
889impl<'a> QueryRun<'a> {
890    /// Create a new query run from a query with start and end positions.
891    ///
892    /// # Arguments
893    /// * `query` - The parent query
894    /// * `start` - The start position (inclusive)
895    /// * `end` - The end position (inclusive), or None for an empty run
896    ///
897    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
898    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
899        Self {
900            query: Some(query),
901            whole_query_snapshot: None,
902            start,
903            end,
904            cached_high_matchables: OnceCell::new(),
905            cached_low_matchables: OnceCell::new(),
906            combined_matchables: RefCell::new(None),
907        }
908    }
909
910    fn whole_query_snapshot(query: &Query<'a>) -> Self {
911        let end = if query.is_empty() {
912            None
913        } else {
914            Some(query.tokens.len() - 1)
915        };
916
917        Self {
918            query: None,
919            whole_query_snapshot: Some(WholeQueryRunSnapshot {
920                index: query.index,
921                tokens: query.tokens.clone(),
922                line_by_pos: query.line_by_pos.clone(),
923                high_matchables: query.high_matchables.clone(),
924                low_matchables: query.low_matchables.clone(),
925            }),
926            start: 0,
927            end,
928            cached_high_matchables: OnceCell::new(),
929            cached_low_matchables: OnceCell::new(),
930            combined_matchables: RefCell::new(None),
931        }
932    }
933
934    fn source_tokens(&self) -> &[TokenId] {
935        if let Some(query) = self.query {
936            &query.tokens
937        } else {
938            &self
939                .whole_query_snapshot
940                .as_ref()
941                .expect("snapshot-backed whole query run should have snapshot data")
942                .tokens
943        }
944    }
945
946    fn source_line_by_pos(&self) -> &[usize] {
947        if let Some(query) = self.query {
948            &query.line_by_pos
949        } else {
950            &self
951                .whole_query_snapshot
952                .as_ref()
953                .expect("snapshot-backed whole query run should have snapshot data")
954                .line_by_pos
955        }
956    }
957
958    fn source_high_matchables(&self) -> &PositionSet {
959        if let Some(query) = self.query {
960            &query.high_matchables
961        } else {
962            &self
963                .whole_query_snapshot
964                .as_ref()
965                .expect("snapshot-backed whole query run should have snapshot data")
966                .high_matchables
967        }
968    }
969
970    fn source_low_matchables(&self) -> &PositionSet {
971        if let Some(query) = self.query {
972            &query.low_matchables
973        } else {
974            &self
975                .whole_query_snapshot
976                .as_ref()
977                .expect("snapshot-backed whole query run should have snapshot data")
978                .low_matchables
979        }
980    }
981
982    /// Get the license index used by this query run.
983    pub fn get_index(&self) -> &LicenseIndex {
984        if let Some(query) = self.query {
985            query.index
986        } else {
987            self.whole_query_snapshot
988                .as_ref()
989                .expect("snapshot-backed whole query run should have snapshot data")
990                .index
991        }
992    }
993
994    /// Get the line number for a specific token position.
995    ///
996    /// # Arguments
997    /// * `pos` - Absolute token position in the query
998    ///
999    /// # Returns
1000    /// The line number (1-based), or None if position is out of range
1001    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
1002        self.source_line_by_pos().get(pos).copied()
1003    }
1004
1005    /// Get the sequence of token IDs for this run.
1006    ///
1007    /// Returns empty slice if end is None.
1008    ///
1009    /// Corresponds to Python: `tokens` property (lines 779-786)
1010    pub fn tokens(&self) -> &[TokenId] {
1011        match self.end {
1012            Some(end) => &self.source_tokens()[self.start..=end],
1013            None => &[],
1014        }
1015    }
1016
1017    /// Iterate over token IDs with their absolute positions.
1018    ///
1019    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
1020    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
1021        self.tokens()
1022            .iter()
1023            .copied()
1024            .enumerate()
1025            .map(|(i, tid)| (self.start + i, tid))
1026    }
1027
1028    /// Check if this query run contains only digit tokens.
1029    ///
1030    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
1031    pub fn is_digits_only(&self) -> bool {
1032        self.tokens()
1033            .iter()
1034            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
1035    }
1036
1037    /// Check if this query run has matchable tokens.
1038    ///
1039    /// # Arguments
1040    /// * `include_low` - If true, include low-value tokens in the check
1041    /// * `exclude_positions` - Optional set of spans containing positions to exclude
1042    ///
1043    /// Returns true if there are matchable tokens remaining
1044    ///
1045    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
1046    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
1047        if self.is_digits_only() {
1048            return false;
1049        }
1050
1051        let matchables = self.matchables(include_low);
1052
1053        if exclude_positions.is_empty() {
1054            return !matchables.is_empty();
1055        }
1056
1057        let mut matchable_set = matchables;
1058        for span in exclude_positions {
1059            matchable_set.remove_span(span);
1060        }
1061
1062        !matchable_set.is_empty()
1063    }
1064
1065    pub fn matchables(&self, include_low: bool) -> PositionSet {
1066        if include_low {
1067            if let Some(ref cached) = *self.combined_matchables.borrow() {
1068                return cached.clone();
1069            }
1070            let combined = self.low_matchables().union(&self.high_matchables());
1071            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1072            combined
1073        } else {
1074            self.high_matchables()
1075        }
1076    }
1077
1078    pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1079        let high_matchables = self.high_matchables();
1080        if high_matchables.is_empty() {
1081            return Vec::new();
1082        }
1083
1084        let matchables = self.matchables(true);
1085        self.tokens_with_pos()
1086            .map(|(pos, tid)| {
1087                if matchables.contains(pos) {
1088                    Some(tid)
1089                } else {
1090                    None
1091                }
1092            })
1093            .collect()
1094    }
1095
1096    pub fn high_matchables(&self) -> PositionSet {
1097        self.cached_high_matchables
1098            .get_or_init(|| {
1099                let start = self.start;
1100                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1101                let source = self.source_high_matchables();
1102                let live_span = PositionSpan::new(start, end);
1103                source
1104                    .iter()
1105                    .filter(|&pos| live_span.contains(pos))
1106                    .collect()
1107            })
1108            .clone()
1109    }
1110
1111    pub fn low_matchables(&self) -> PositionSet {
1112        self.cached_low_matchables
1113            .get_or_init(|| {
1114                let start = self.start;
1115                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1116                let source = self.source_low_matchables();
1117                let live_span = PositionSpan::new(start, end);
1118                source
1119                    .iter()
1120                    .filter(|&pos| live_span.contains(pos))
1121                    .collect()
1122            })
1123            .clone()
1124    }
1125}
1126
1127#[cfg(test)]
1128mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs