provenant/license_detection/query/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Query processing - tokenized input for license matching.
5
6use crate::license_detection::index::LicenseIndex;
7use crate::license_detection::index::dictionary::{KnownToken, QueryToken, TokenId, TokenKind};
8use crate::license_detection::models::PositionSpan;
9use crate::license_detection::position_set::PositionSet;
10use crate::license_detection::spdx_lid::split_spdx_lid;
11use crate::license_detection::tokenize::STOPWORDS;
12use crate::license_detection::tokenize::tokenize_as_ids;
13use regex::Regex;
14use std::cell::{OnceCell, RefCell};
15use std::collections::HashMap;
16use std::sync::LazyLock;
17use std::time::Instant;
18
/// Matches one query word: one or more word characters other than `_`,
/// an optional `+`, then an optional tail of word characters other than `_`.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("valid query regex"));
/// Splits text into `token` captures (same shape as `QUERY_PATTERN`) and
/// `punct` captures (runs of underscores, non-word characters, whitespace or `+`).
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("valid matched text regex")
});
25
/// One piece of the original text produced while rebuilding matched text for
/// diagnostics: either a word-like token or a run of punctuation/whitespace.
#[derive(Clone)]
struct MatchedTextToken {
    /// The text of this piece as it will be rendered.
    value: String,
    /// 1-based line number the piece was found on.
    line_num: usize,
    /// Position among the known (dictionary) tokens, or `None` when the piece
    /// is punctuation or a word that is not in the dictionary.
    pos: Option<usize>,
    /// `true` for word-like tokens, `false` for punctuation/whitespace.
    is_text: bool,
    /// `true` once the token's position was found in the matched positions.
    is_matched: bool,
}
34
/// Tokenized input text prepared for license matching.
///
/// Query holds:
/// - Known token IDs (tokens existing in the index dictionary)
/// - Token positions and their corresponding line numbers (line_by_pos)
/// - Unknown tokens (tokens not in dictionary) tracked per position
/// - Stopwords tracked per position
/// - Positions with short/digit-only tokens
/// - High and low matchable token positions (for tracking what's been matched)
///
/// Based on Python Query class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 155-295)
#[derive(Debug)]
pub struct Query<'a> {
    /// The original input text.
    ///
    /// Corresponds to Python: `self.query_string` (line 215)
    pub text: String,

    /// Token IDs for known tokens (tokens found in the index dictionary)
    ///
    /// Corresponds to Python: `self.tokens = []` (line 228)
    pub tokens: Vec<TokenId>,

    /// Mapping from token position to line number (1-based)
    ///
    /// Each token position in `self.tokens` maps to the line number where it appears.
    /// This is used for match position reporting.
    ///
    /// Corresponds to Python: `self.line_by_pos = []` (line 231)
    pub line_by_pos: Vec<usize>,

    /// Mapping from token position to count of unknown tokens after that position
    ///
    /// Unknown tokens are those not found in the dictionary. We track them by
    /// counting how many unknown tokens appear after each known position.
    /// Unknown tokens before the first known token are tracked with the key `None`.
    ///
    /// Corresponds to Python: `self.unknowns_by_pos = {}` (line 236)
    pub unknowns_by_pos: HashMap<Option<usize>, usize>,

    /// Mapping from token position to count of stopwords after that position
    ///
    /// Same layout as `unknowns_by_pos`, but counting stopwords; stopwords
    /// before the first known token are tracked with the key `None`.
    ///
    /// Corresponds to Python: `self.stopwords_by_pos = {}` (line 244)
    pub stopwords_by_pos: HashMap<Option<usize>, usize>,

    /// Set of positions with single-character or digit-only tokens
    ///
    /// These tokens have special handling in matching.
    ///
    /// Corresponds to Python: `self.shorts_and_digits_pos = set()` (line 249)
    pub shorts_and_digits_pos: PositionSet,

    /// High-value matchable token positions (legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Legalese`.
    ///
    /// Corresponds to Python: `self.high_matchables` (line 293)
    pub high_matchables: PositionSet,

    /// Low-value matchable token positions (non-legalese tokens)
    ///
    /// These are the positions whose token kind is `TokenKind::Regular`.
    ///
    /// Corresponds to Python: `self.low_matchables` (line 294)
    pub low_matchables: PositionSet,

    /// True if the query is detected as binary content
    ///
    /// Corresponds to Python: `self.is_binary = False` (line 225)
    pub is_binary: bool,

    /// Raw query run ranges (start, end) computed during tokenization.
    ///
    /// QueryRuns are created on-demand from these ranges.
    ///
    /// Corresponds to Python: `self.query_runs = []` (line 274)
    pub(crate) query_run_ranges: Vec<(usize, Option<usize>)>,

    /// SPDX-License-Identifier lines found during tokenization.
    ///
    /// Each tuple is (spdx_text, start_token_pos, end_token_pos).
    /// Used for creating LicenseMatches with correct token positions.
    ///
    /// Corresponds to Python: `self.spdx_lines = []` (line 507)
    pub spdx_lines: Vec<(String, usize, usize)>,

    /// Reference to the license index for dictionary access and metadata
    pub index: &'a LicenseIndex,
}
126
/// Return the lines `start_line..=end_line` (1-based, inclusive) of `text`,
/// joined with `"\n"`.
///
/// Yields an empty string when either bound is 0, when `start_line` is past
/// `end_line`, or when the range lies entirely beyond the last line.
pub fn matched_text_from_text(text: &str, start_line: usize, end_line: usize) -> String {
    let range_is_valid = start_line >= 1 && end_line >= 1 && start_line <= end_line;
    if !range_is_valid {
        return String::new();
    }

    let wanted = end_line - start_line + 1;
    let mut selected = String::new();

    for (nth, line) in text.lines().skip(start_line - 1).take(wanted).enumerate() {
        if nth > 0 {
            selected.push('\n');
        }
        selected.push_str(line);
    }

    selected
}
145
146pub fn matched_text_diagnostics_from_text(
147    text: &str,
148    query: &Query<'_>,
149    matched_positions: &PositionSet,
150    start_pos: usize,
151    end_pos: usize,
152    start_line: usize,
153    end_line: usize,
154) -> String {
155    let tokens = tokenize_matched_text(text, query);
156    let reportable_tokens = collect_reportable_tokens(
157        tokens,
158        matched_positions,
159        start_pos,
160        end_pos,
161        start_line,
162        end_line,
163    );
164    let line_endings = collect_line_endings(text);
165
166    render_diagnostic_tokens(&reportable_tokens, &line_endings)
167}
168
169fn tokenize_matched_text(text: &str, query: &Query<'_>) -> Vec<MatchedTextToken> {
170    let mut tokens = Vec::new();
171    let mut pos = 0usize;
172    let mut line_num = 1usize;
173
174    for line in text.split_inclusive('\n') {
175        for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
176            if let Some(token_match) = capture.name("token") {
177                let token_text = token_match.as_str();
178                let retokenized: Vec<String> = QUERY_PATTERN
179                    .find_iter(&token_text.to_lowercase())
180                    .map(|m| m.as_str().to_string())
181                    .filter(|token| !STOPWORDS.contains(token.as_str()))
182                    .collect();
183
184                if retokenized.is_empty() {
185                    tokens.push(MatchedTextToken {
186                        value: token_text.to_string(),
187                        line_num,
188                        pos: None,
189                        is_text: true,
190                        is_matched: false,
191                    });
192                } else if retokenized.len() == 1 {
193                    let token = &retokenized[0];
194                    let token_pos = if query.index.dictionary.get(token).is_some() {
195                        let current_pos = pos;
196                        pos += 1;
197                        Some(current_pos)
198                    } else {
199                        None
200                    };
201
202                    tokens.push(MatchedTextToken {
203                        value: token_text.to_string(),
204                        line_num,
205                        pos: token_pos,
206                        is_text: true,
207                        is_matched: false,
208                    });
209                } else {
210                    for token in retokenized {
211                        let token_pos = if query.index.dictionary.get(&token).is_some() {
212                            let current_pos = pos;
213                            pos += 1;
214                            Some(current_pos)
215                        } else {
216                            None
217                        };
218
219                        tokens.push(MatchedTextToken {
220                            value: token,
221                            line_num,
222                            pos: token_pos,
223                            is_text: true,
224                            is_matched: false,
225                        });
226                    }
227                }
228            } else if let Some(punct_match) = capture.name("punct") {
229                tokens.push(MatchedTextToken {
230                    value: punct_match.as_str().to_string(),
231                    line_num,
232                    pos: None,
233                    is_text: false,
234                    is_matched: false,
235                });
236            }
237        }
238
239        line_num += 1;
240    }
241
242    tokens
243}
244
/// Select the tokens to show for one match and flag the matched ones.
///
/// Tokens on lines outside `start_line..=end_line` are ignored. A token is
/// kept when its position is in `matched_positions`, or when it lies between
/// the token at `start_pos` and the token at `end_pos` (both inclusive).
/// Non-blank punctuation that directly follows the end token is kept as well.
fn collect_reportable_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`: the start token has been seen and the end token has not.
    let mut started = false;
    // `finished`: the end token has been seen; never reset afterwards.
    let mut finished = false;
    // Index (in `tokens`) of the end token, kept until the next token is examined.
    let mut end_real_pos = None;
    // Index (in `tokens`) of the previously examined token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Line numbers never decrease along `tokens`, so nothing later can qualify.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token.pos.is_some_and(|pos| matched_positions.contains(pos)) {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything inside the start..end window is reported, matched or not.
        if started && !finished {
            is_included = true;
        }

        // Checked after the window test so the end token itself is still included.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token immediately after the end token: keep it when
        // it is punctuation that is not pure whitespace.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
306
/// Record the terminator of every line in `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or an empty string for a final line that
/// has no terminator.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for segment in text.split_inclusive('\n') {
        let ending = match segment.strip_suffix('\n') {
            None => "",
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
        };
        endings.push(ending.to_string());
    }

    endings
}
320
321fn render_diagnostic_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
322    let mut rendered = String::new();
323    let mut previous_line: Option<usize> = None;
324
325    for token in tokens {
326        if let Some(prev_line) = previous_line
327            && token.line_num > prev_line
328        {
329            for line in prev_line..token.line_num {
330                if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
331                    rendered.push_str(line_ending.as_str());
332                }
333            }
334        }
335
336        let token_value = if token.is_text {
337            token.value.as_str()
338        } else {
339            token
340                .value
341                .strip_suffix("\r\n")
342                .or_else(|| token.value.strip_suffix('\n'))
343                .unwrap_or(token.value.as_str())
344        };
345
346        if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
347            if token.is_matched {
348                rendered.push_str(token_value);
349            } else {
350                rendered.push('[');
351                rendered.push_str(token_value);
352                rendered.push(']');
353            }
354        } else {
355            rendered.push_str(token_value);
356        }
357
358        previous_line = Some(token.line_num);
359    }
360
361    rendered
362}
363
364impl<'a> Query<'a> {
    /// Number of accumulated low-value lines that ends a query run when the
    /// text is not binary-derived (passed as `line_threshold` to
    /// `compute_query_runs`).
    ///
    /// Detection scans file-like text, so this uses Python's
    /// `build_query(..., text_line_threshold=15)` threshold.
    const TEXT_LINE_THRESHOLD: usize = 15;
    /// Line threshold used instead of `TEXT_LINE_THRESHOLD` when the text is
    /// binary-derived.
    const BINARY_LINE_THRESHOLD: usize = 50;
    /// Maximum number of tokens per chunk when long lines are broken up.
    const MAX_TOKEN_PER_LINE: usize = 25;
382
383    fn compute_spdx_offset(
384        tokens: &[QueryToken],
385        dictionary: &crate::license_detection::index::dictionary::TokenDictionary,
386    ) -> Option<usize> {
387        let get_known_id = |i: usize| -> Option<TokenId> {
388            match tokens.get(i)? {
389                QueryToken::Known(known) => Some(known.id),
390                _ => None,
391            }
392        };
393
394        let spdx_id = dictionary.get("spdx")?;
395        let license_id = dictionary.get("license")?;
396        let identifier_id = dictionary.get("identifier")?;
397        let licence_id = dictionary.get("licence");
398
399        let licenses_id = dictionary.get("licenses");
400        let nuget_id = dictionary.get("nuget");
401        let org_id = dictionary.get("org");
402
403        let is_spdx_prefix = |ids: [Option<TokenId>; 3]| -> bool {
404            ids.iter().all(|id| id.is_some())
405                && ids[0] == Some(spdx_id)
406                && (ids[1] == Some(license_id) || ids[1] == licence_id)
407                && ids[2] == Some(identifier_id)
408        };
409
410        let is_nuget_prefix = |ids: [Option<TokenId>; 3]| -> bool {
411            licenses_id.is_some()
412                && nuget_id.is_some()
413                && org_id.is_some()
414                && ids[0] == licenses_id
415                && ids[1] == Some(nuget_id.unwrap())
416                && ids[2] == Some(org_id.unwrap())
417        };
418
419        if tokens.len() >= 3 {
420            let first_three = [get_known_id(0), get_known_id(1), get_known_id(2)];
421            if is_spdx_prefix(first_three) || is_nuget_prefix(first_three) {
422                return Some(0);
423            }
424        }
425
426        if tokens.len() >= 4 {
427            let second_three = [get_known_id(1), get_known_id(2), get_known_id(3)];
428            if is_spdx_prefix(second_three) || is_nuget_prefix(second_three) {
429                return Some(1);
430            }
431        }
432
433        if tokens.len() >= 5 {
434            let third_three = [get_known_id(2), get_known_id(3), get_known_id(4)];
435            if is_spdx_prefix(third_three) || is_nuget_prefix(third_three) {
436                return Some(2);
437            }
438        }
439
440        None
441    }
442
    /// Build a query from already extracted text, without a deadline.
    ///
    /// Delegates to [`Self::from_extracted_text_with_deadline`] with
    /// `deadline` set to `None`.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether the text was extracted from binary content
    ///
    /// # Errors
    /// Returns whatever error the delegate returns.
    pub fn from_extracted_text(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
    ) -> Result<Self, anyhow::Error> {
        Self::from_extracted_text_with_deadline(text, index, binary_derived, None)
    }
450
    /// Build a query from already extracted text, honouring an optional deadline.
    ///
    /// `binary_derived` selects the query-run line threshold
    /// (`BINARY_LINE_THRESHOLD` when true, `TEXT_LINE_THRESHOLD` otherwise) and
    /// is stored as the query's `is_binary` flag, so no binary detection runs.
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `binary_derived` - Whether the text was extracted from binary content
    /// * `deadline` - Optional point in time passed to the deadline checks
    ///
    /// # Errors
    /// Returns the error produced by a failed deadline check.
    pub fn from_extracted_text_with_deadline(
        text: &str,
        index: &'a LicenseIndex,
        binary_derived: bool,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        let line_threshold = if binary_derived {
            Self::BINARY_LINE_THRESHOLD
        } else {
            Self::TEXT_LINE_THRESHOLD
        };

        Self::with_source_options(text, index, line_threshold, Some(binary_derived), deadline)
    }
465
466    /// Iterate over query runs.
467    ///
468    /// Corresponds to Python: `query.query_runs` property iteration
469    pub fn query_runs(&self) -> Vec<QueryRun<'_>> {
470        self.query_run_ranges
471            .iter()
472            .map(|&(start, end)| QueryRun::new(self, start, end))
473            .collect()
474    }
475
    /// Tokenize `text` line by line and assemble the [`Query`].
    ///
    /// # Arguments
    /// * `text` - The input text to tokenize
    /// * `index` - The license index containing the token dictionary
    /// * `line_threshold` - Passed to `compute_query_runs` to decide where runs break
    /// * `binary_derived` - Known binary flag; when `None`, `detect_binary` decides
    /// * `deadline` - Optional deadline, checked on entry, every 128 lines and
    ///   once after tokenization
    ///
    /// # Errors
    /// Returns the error of a failed deadline check or of `detect_binary`.
    fn with_source_options(
        text: &str,
        index: &'a LicenseIndex,
        line_threshold: usize,
        binary_derived: Option<bool>,
        deadline: Option<Instant>,
    ) -> Result<Self, anyhow::Error> {
        crate::license_detection::ensure_within_deadline(deadline)?;
        let is_binary = match binary_derived {
            Some(is_binary) => is_binary,
            None => Self::detect_binary(text)?,
        };
        let has_long_lines = Self::detect_long_lines(text);

        let mut tokens = Vec::new();
        let mut line_by_pos = Vec::new();
        let mut unknowns_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut stopwords_by_pos: HashMap<Option<usize>, usize> = HashMap::new();
        let mut shorts_and_digits_pos = PositionSet::new();
        let mut spdx_lines: Vec<(String, usize, usize)> = Vec::new();

        // Position of the most recent known token; `None` until the first one.
        let mut known_pos: Option<usize> = None;
        // Becomes true at the first known token (i.e. once `known_pos` is `Some`).
        let mut started = false;
        let mut current_line = 1usize;

        // Per line: known tokens as `Some`, unknown tokens as `None`; stopwords
        // are not recorded. Input for query-run computation.
        let mut tokens_by_line: Vec<Vec<Option<KnownToken>>> = Vec::new();

        for (line_index, line) in text.lines().enumerate() {
            if line_index.is_multiple_of(128) {
                crate::license_detection::ensure_within_deadline(deadline)?;
            }

            let line_trimmed = line.trim();
            let mut line_tokens: Vec<Option<KnownToken>> = Vec::new();

            let mut line_first_known_pos = None;

            let line_query_tokens = tokenize_as_ids(line_trimmed, &index.dictionary);

            for query_token in &line_query_tokens {
                match query_token {
                    QueryToken::Known(known_token) => {
                        known_pos = Some(known_pos.map_or(0, |p| p + 1));
                        started = true;
                        tokens.push(known_token.id);
                        line_by_pos.push(current_line);
                        line_tokens.push(Some(*known_token));

                        if line_first_known_pos.is_none() {
                            line_first_known_pos = known_pos;
                        }

                        if known_token.is_short_or_digit {
                            let _ = shorts_and_digits_pos.insert(known_pos.unwrap());
                        }
                    }
                    // Unknown tokens and stopwords are counted against the last
                    // known position, or against `None` before the first one.
                    QueryToken::Unknown if !started => {
                        *unknowns_by_pos.entry(None).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Unknown => {
                        *unknowns_by_pos.entry(known_pos).or_insert(0) += 1;
                        line_tokens.push(None);
                    }
                    QueryToken::Stopword if !started => {
                        *stopwords_by_pos.entry(None).or_insert(0) += 1;
                    }
                    QueryToken::Stopword => {
                        *stopwords_by_pos.entry(known_pos).or_insert(0) += 1;
                    }
                }
            }

            let line_last_known_pos = known_pos;

            let spdx_start_offset =
                Self::compute_spdx_offset(&line_query_tokens, &index.dictionary);

            // Record the line as an SPDX line when it carries the prefix and
            // has at least one known token.
            if let Some(offset) = spdx_start_offset
                && let Some(line_first_known_pos) = line_first_known_pos
            {
                let (spdx_prefix, spdx_expression) = split_spdx_lid(line);
                let spdx_text = format!("{}{}", spdx_prefix.unwrap_or_default(), spdx_expression);
                let spdx_start_known_pos = line_first_known_pos + offset;

                // `line_last_known_pos` is `Some` here because this line has a known token.
                if spdx_start_known_pos <= line_last_known_pos.unwrap() {
                    // The recorded end is one past the line's last known position.
                    let spdx_end = line_last_known_pos.unwrap() + 1;
                    spdx_lines.push((spdx_text, spdx_start_known_pos, spdx_end));
                }
            }

            tokens_by_line.push(line_tokens);
            current_line += 1;
        }

        crate::license_detection::ensure_within_deadline(deadline)?;

        // Positions are split by the dictionary's token kind.
        let high_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Legalese)
            .map(|(pos, _tid)| pos)
            .collect();

        let low_matchables: PositionSet = tokens
            .iter()
            .enumerate()
            .filter(|(_pos, tid)| index.dictionary.token_kind(**tid) == TokenKind::Regular)
            .map(|(pos, _tid)| pos)
            .collect();

        let query_runs = Self::compute_query_runs(&tokens_by_line, line_threshold, has_long_lines);

        Ok(Query {
            text: text.to_string(),
            tokens,
            line_by_pos,
            unknowns_by_pos,
            stopwords_by_pos,
            shorts_and_digits_pos,
            high_matchables,
            low_matchables,
            is_binary,
            query_run_ranges: query_runs,
            spdx_lines,
            index,
        })
    }
604
605    /// Detect if text is binary content.
606    ///
607    /// Binary detection checks for:
608    /// - Null bytes (0x00)
609    /// - High ratio of non-printable characters
610    ///
611    /// # Arguments
612    /// * `text` - The text to analyze
613    ///
614    /// # Returns
615    /// true if binary, false otherwise
616    ///
617    /// Corresponds to Python: `typecode.get_type().is_binary` usage (lines 123-135)
618    fn detect_binary(text: &str) -> Result<bool, anyhow::Error> {
619        let null_byte_count = text.bytes().filter(|&b| b == 0).count();
620
621        if null_byte_count > 0 {
622            return Ok(true);
623        }
624
625        let non_printable_ratio = text
626            .chars()
627            .filter(|&c| {
628                !c.is_ascii() && !c.is_ascii_graphic() && c != '\n' && c != '\r' && c != '\t'
629            })
630            .count() as f64
631            / text.len().max(1) as f64;
632
633        Ok(non_printable_ratio > 0.3)
634    }
635
636    /// Detect if text has very long lines (for minified JS/CSS).
637    ///
638    /// # Arguments
639    /// * `text` - The text to analyze
640    ///
641    /// # Returns
642    /// true if there are lines with many tokens, false otherwise
643    ///
644    /// Corresponds to Python: `typecode.get_type().is_text_with_long_lines` usage
645    fn detect_long_lines(text: &str) -> bool {
646        text.lines()
647            .any(|line| crate::license_detection::tokenize::count_tokens(line) > 25)
648    }
649
650    fn break_long_lines(lines: &[Vec<Option<KnownToken>>]) -> Vec<Vec<Option<KnownToken>>> {
651        lines
652            .iter()
653            .flat_map(|line| {
654                if line.is_empty() {
655                    return Vec::new();
656                }
657
658                if line.len() <= Self::MAX_TOKEN_PER_LINE {
659                    vec![line.clone()]
660                } else {
661                    line.chunks(Self::MAX_TOKEN_PER_LINE)
662                        .map(|chunk| chunk.to_vec())
663                        .collect()
664                }
665            })
666            .collect()
667    }
668
    /// Split the token stream into query runs.
    ///
    /// Lines are walked in order while counting low-value lines: lines with no
    /// tokens, lines that are all digit-only or unknown tokens, and lines
    /// without a legalese token. A line with a legalese token resets the count.
    /// Once the count reaches `line_threshold` and the current run has tokens,
    /// the run is closed and a new one starts. Runs made only of digit-only
    /// tokens are dropped.
    ///
    /// # Arguments
    /// * `tokens_by_line` - Per line, known tokens as `Some` and unknown ones as `None`
    /// * `line_threshold` - Number of low-value lines that ends a run
    /// * `has_long_lines` - When true, lines are first chunked by `break_long_lines`
    ///
    /// # Returns
    /// `(start, end)` ranges of known-token positions; `end` is inclusive.
    fn compute_query_runs(
        tokens_by_line: &[Vec<Option<KnownToken>>],
        line_threshold: usize,
        has_long_lines: bool,
    ) -> Vec<(usize, Option<usize>)> {
        let processed_lines = if has_long_lines {
            Self::break_long_lines(tokens_by_line)
        } else {
            tokens_by_line.to_vec()
        };

        let mut query_runs = Vec::new();
        let mut query_run_start = 0usize;
        // `None` while the current run holds no known token yet.
        let mut query_run_end = None;
        let mut empty_lines = 0usize;
        // Position of the next known token.
        let mut pos = 0usize;
        let mut query_run_is_all_digit = true;

        for line_tokens in processed_lines {
            // Break point: the current run has tokens and enough low-value
            // lines have accumulated.
            if query_run_end.is_some() && empty_lines >= line_threshold {
                if !query_run_is_all_digit {
                    query_runs.push((query_run_start, query_run_end));
                }
                query_run_start = pos;
                query_run_end = None;
                empty_lines = 0;
                query_run_is_all_digit = true;
            }

            // An empty run starts at the next token position.
            if query_run_end.is_none() {
                query_run_start = pos;
            }

            if line_tokens.is_empty() {
                empty_lines += 1;
                continue;
            }

            // Unknown tokens (`None`) count as digit-only here.
            let line_is_all_digit = line_tokens
                .iter()
                .all(|token_id| token_id.map(|known| known.is_digit_only).unwrap_or(true));
            let mut line_has_known_tokens = false;
            let mut line_has_good_tokens = false;

            // Only known tokens advance `pos` and extend the run.
            for known in line_tokens.into_iter().flatten() {
                line_has_known_tokens = true;
                if known.kind == TokenKind::Legalese {
                    line_has_good_tokens = true;
                }
                if !known.is_digit_only {
                    query_run_is_all_digit = false;
                }
                query_run_end = Some(pos);
                pos += 1;
            }

            if line_is_all_digit || !line_has_known_tokens {
                empty_lines += 1;
                continue;
            }

            if line_has_good_tokens {
                empty_lines = 0;
            } else {
                empty_lines += 1;
            }
        }

        // Close the final run if it holds a non-digit token.
        if let Some(end) = query_run_end
            && !query_run_is_all_digit
        {
            query_runs.push((query_run_start, Some(end)));
        }

        query_runs
    }
745
    /// Get the line number for a token position.
    ///
    /// # Arguments
    /// * `pos` - The token position
    ///
    /// # Returns
    /// The line number (1-based), or `None` if `pos` is out of range
    #[inline]
    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
        self.line_by_pos.get(pos).copied()
    }
759
    /// Check if the query is empty (no known tokens).
    ///
    /// Unknown tokens and stopwords are not stored in `tokens`, so a text made
    /// only of those is empty as well.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.tokens.is_empty()
    }
765
    /// Get a query run covering the entire query.
    ///
    /// The returned run does not borrow this query: it carries its own copy
    /// of the tokens, line numbers and matchable positions taken at call time.
    ///
    /// Corresponds to Python: `whole_query_run()` method (lines 306-317)
    pub fn whole_query_run(&self) -> QueryRun<'a> {
        QueryRun::whole_query_snapshot(self)
    }
772
    /// Subtract matched span positions from matchables.
    ///
    /// This removes the positions from both high and low matchables, in place.
    ///
    /// # Arguments
    /// * `span` - The span of positions to subtract
    ///
    /// Corresponds to Python: `subtract()` method (lines 328-334)
    pub fn subtract(&mut self, span: &PositionSpan) {
        self.high_matchables.remove_span(span);
        self.low_matchables.remove_span(span);
    }
785
    /// Extract matched text for a given line range.
    ///
    /// Returns the text from the original input between start_line and end_line
    /// (both inclusive, 1-indexed). Lines are joined with `"\n"`, whatever
    /// terminator the input used.
    ///
    /// # Arguments
    /// * `start_line` - Starting line number (1-indexed)
    /// * `end_line` - Ending line number (1-indexed)
    ///
    /// # Returns
    /// The matched text, or empty string if lines are out of range
    ///
    /// Corresponds to Python: `matched_text()` method in match.py (lines 757-795)
    pub fn matched_text(&self, start_line: usize, end_line: usize) -> String {
        matched_text_from_text(&self.text, start_line, end_line)
    }
802}
803
/// Owned copy of the query data a whole-query [`QueryRun`] reads, so that the
/// run does not have to borrow the [`Query`] it was created from.
#[derive(Debug, Clone)]
struct WholeQueryRunSnapshot<'a> {
    /// The license index of the originating query.
    index: &'a LicenseIndex,
    /// Copy of the query's known token IDs.
    tokens: Vec<TokenId>,
    /// Copy of the query's position-to-line mapping.
    line_by_pos: Vec<usize>,
    /// Copy of the query's high matchable positions at snapshot time.
    high_matchables: PositionSet,
    /// Copy of the query's low matchable positions at snapshot time.
    low_matchables: PositionSet,
}
812
/// A query run is a slice of query tokens identified by a start and end positions.
///
/// Query runs break a query into manageable chunks for efficient matching.
/// They track matchable token positions and support subtraction of matched spans.
///
/// A run reads its data either from a borrowed [`Query`] (`query`) or from an
/// owned snapshot (`whole_query_snapshot`); the constructors set exactly one.
///
/// Based on Python QueryRun class at:
/// reference/scancode-toolkit/src/licensedcode/query.py (lines 720-914)
#[derive(Debug, Clone)]
pub struct QueryRun<'a> {
    /// Parent query, when the run borrows its data.
    query: Option<&'a Query<'a>>,
    /// Owned query data, when the run was built as a whole-query snapshot.
    whole_query_snapshot: Option<WholeQueryRunSnapshot<'a>>,
    /// Start position of the run (inclusive).
    pub start: usize,
    /// End position of the run (inclusive), or `None` for an empty run.
    pub end: Option<usize>,
    // Caches start out empty; NOTE(review): presumably filled lazily with the
    // run-local matchable positions by methods outside this view — confirm.
    cached_high_matchables: OnceCell<PositionSet>,
    cached_low_matchables: OnceCell<PositionSet>,
    combined_matchables: RefCell<Option<PositionSet>>,
}
830
831impl<'a> QueryRun<'a> {
    /// Create a new query run from a query with start and end positions.
    ///
    /// The run borrows `query`; its caches start out empty.
    ///
    /// # Arguments
    /// * `query` - The parent query
    /// * `start` - The start position (inclusive)
    /// * `end` - The end position (inclusive), or None for an empty run
    ///
    /// Corresponds to Python: `QueryRun.__init__()` (lines 735-749)
    pub fn new(query: &'a Query<'a>, start: usize, end: Option<usize>) -> Self {
        Self {
            query: Some(query),
            whole_query_snapshot: None,
            start,
            end,
            cached_high_matchables: OnceCell::new(),
            cached_low_matchables: OnceCell::new(),
            combined_matchables: RefCell::new(None),
        }
    }
851
852    fn whole_query_snapshot(query: &Query<'a>) -> Self {
853        let end = if query.is_empty() {
854            None
855        } else {
856            Some(query.tokens.len() - 1)
857        };
858
859        Self {
860            query: None,
861            whole_query_snapshot: Some(WholeQueryRunSnapshot {
862                index: query.index,
863                tokens: query.tokens.clone(),
864                line_by_pos: query.line_by_pos.clone(),
865                high_matchables: query.high_matchables.clone(),
866                low_matchables: query.low_matchables.clone(),
867            }),
868            start: 0,
869            end,
870            cached_high_matchables: OnceCell::new(),
871            cached_low_matchables: OnceCell::new(),
872            combined_matchables: RefCell::new(None),
873        }
874    }
875
876    fn source_tokens(&self) -> &[TokenId] {
877        if let Some(query) = self.query {
878            &query.tokens
879        } else {
880            &self
881                .whole_query_snapshot
882                .as_ref()
883                .expect("snapshot-backed whole query run should have snapshot data")
884                .tokens
885        }
886    }
887
888    fn source_line_by_pos(&self) -> &[usize] {
889        if let Some(query) = self.query {
890            &query.line_by_pos
891        } else {
892            &self
893                .whole_query_snapshot
894                .as_ref()
895                .expect("snapshot-backed whole query run should have snapshot data")
896                .line_by_pos
897        }
898    }
899
900    fn source_high_matchables(&self) -> &PositionSet {
901        if let Some(query) = self.query {
902            &query.high_matchables
903        } else {
904            &self
905                .whole_query_snapshot
906                .as_ref()
907                .expect("snapshot-backed whole query run should have snapshot data")
908                .high_matchables
909        }
910    }
911
912    fn source_low_matchables(&self) -> &PositionSet {
913        if let Some(query) = self.query {
914            &query.low_matchables
915        } else {
916            &self
917                .whole_query_snapshot
918                .as_ref()
919                .expect("snapshot-backed whole query run should have snapshot data")
920                .low_matchables
921        }
922    }
923
924    /// Get the license index used by this query run.
925    pub fn get_index(&self) -> &LicenseIndex {
926        if let Some(query) = self.query {
927            query.index
928        } else {
929            self.whole_query_snapshot
930                .as_ref()
931                .expect("snapshot-backed whole query run should have snapshot data")
932                .index
933        }
934    }
935
936    /// Get the line number for a specific token position.
937    ///
938    /// # Arguments
939    /// * `pos` - Absolute token position in the query
940    ///
941    /// # Returns
942    /// The line number (1-based), or None if position is out of range
943    pub fn line_for_pos(&self, pos: usize) -> Option<usize> {
944        self.source_line_by_pos().get(pos).copied()
945    }
946
947    /// Get the sequence of token IDs for this run.
948    ///
949    /// Returns empty slice if end is None.
950    ///
951    /// Corresponds to Python: `tokens` property (lines 779-786)
952    pub fn tokens(&self) -> &[TokenId] {
953        match self.end {
954            Some(end) => &self.source_tokens()[self.start..=end],
955            None => &[],
956        }
957    }
958
959    /// Iterate over token IDs with their absolute positions.
960    ///
961    /// Corresponds to Python: `tokens_with_pos()` method (lines 788-789)
962    pub fn tokens_with_pos(&self) -> impl Iterator<Item = (usize, TokenId)> + '_ {
963        self.tokens()
964            .iter()
965            .copied()
966            .enumerate()
967            .map(|(i, tid)| (self.start + i, tid))
968    }
969
970    /// Check if this query run contains only digit tokens.
971    ///
972    /// Corresponds to Python: `is_digits_only()` method (lines 791-796)
973    pub fn is_digits_only(&self) -> bool {
974        self.tokens()
975            .iter()
976            .all(|&tid| self.get_index().dictionary.is_digit_only_token(tid))
977    }
978
979    /// Check if this query run has matchable tokens.
980    ///
981    /// # Arguments
982    /// * `include_low` - If true, include low-value tokens in the check
983    /// * `exclude_positions` - Optional set of spans containing positions to exclude
984    ///
985    /// Returns true if there are matchable tokens remaining
986    ///
987    /// Corresponds to Python: `is_matchable()` method (lines 798-818)
988    pub fn is_matchable(&self, include_low: bool, exclude_positions: &[PositionSpan]) -> bool {
989        if self.is_digits_only() {
990            return false;
991        }
992
993        let matchables = self.matchables(include_low);
994
995        if exclude_positions.is_empty() {
996            return !matchables.is_empty();
997        }
998
999        let mut matchable_set = matchables;
1000        for span in exclude_positions {
1001            matchable_set.remove_span(span);
1002        }
1003
1004        !matchable_set.is_empty()
1005    }
1006
1007    pub fn matchables(&self, include_low: bool) -> PositionSet {
1008        if include_low {
1009            if let Some(ref cached) = *self.combined_matchables.borrow() {
1010                return cached.clone();
1011            }
1012            let combined = self.low_matchables().union(&self.high_matchables());
1013            *self.combined_matchables.borrow_mut() = Some(combined.clone());
1014            combined
1015        } else {
1016            self.high_matchables()
1017        }
1018    }
1019
1020    pub fn matchable_tokens(&self) -> Vec<Option<TokenId>> {
1021        let high_matchables = self.high_matchables();
1022        if high_matchables.is_empty() {
1023            return Vec::new();
1024        }
1025
1026        let matchables = self.matchables(true);
1027        self.tokens_with_pos()
1028            .map(|(pos, tid)| {
1029                if matchables.contains(pos) {
1030                    Some(tid)
1031                } else {
1032                    None
1033                }
1034            })
1035            .collect()
1036    }
1037
1038    pub fn high_matchables(&self) -> PositionSet {
1039        self.cached_high_matchables
1040            .get_or_init(|| {
1041                let start = self.start;
1042                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1043                let source = self.source_high_matchables();
1044                let live_span = PositionSpan::new(start, end);
1045                source
1046                    .iter()
1047                    .filter(|&pos| live_span.contains(pos))
1048                    .collect()
1049            })
1050            .clone()
1051    }
1052
1053    pub fn low_matchables(&self) -> PositionSet {
1054        self.cached_low_matchables
1055            .get_or_init(|| {
1056                let start = self.start;
1057                let end = self.end.map(|e| e + 1).unwrap_or(usize::MAX);
1058                let source = self.source_low_matchables();
1059                let live_span = PositionSpan::new(start, end);
1060                source
1061                    .iter()
1062                    .filter(|&pos| live_span.contains(pos))
1063                    .collect()
1064            })
1065            .clone()
1066    }
1067}
1068
1069#[cfg(test)]
1070mod test;
provenant/license_detection/query/mod.rs

provenant/license_detection/query/
mod.rs