Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7mod position_set;
8mod token_multiset;
9mod token_set;
10
11#[cfg(test)]
12mod embedded_test;
13pub mod expression;
14#[cfg(test)]
15mod golden_test;
16#[cfg(feature = "golden-tests")]
17pub mod golden_utils;
18pub mod hash_match;
19pub mod index;
20mod match_refine;
21pub mod models;
22pub mod query;
23pub mod rules;
24pub mod seq_match;
25pub mod spdx_lid;
26pub mod spdx_mapping;
27#[cfg(test)]
28mod test_utils;
29pub mod tokenize;
30pub mod unknown_match;
31
32use bit_set::BitSet;
33use std::collections::HashSet;
34use std::path::Path;
35use std::sync::Arc;
36
37use anyhow::Result;
38
39use crate::license_detection::embedded::index::load_license_index_from_bytes;
40use crate::license_detection::index::build_index_from_loaded;
41use crate::license_detection::query::Query;
42use crate::license_detection::rules::{
43    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
44};
45use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
46use crate::utils::text::strip_utf8_bom_str;
47
48use crate::license_detection::detection::{
49    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
50};
51use crate::license_detection::models::MatcherKind;
52
/// Location of the license rules directory inside the reference
/// scancode-toolkit submodule.
///
/// Only test code and the xtask `generate-license-loader-artifact` binary use
/// this constant, hence the `dead_code` allowance.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Location of the licenses directory inside the reference scancode-toolkit
/// submodule.
///
/// Only test code and the xtask `generate-license-loader-artifact` binary use
/// this constant, hence the `dead_code` allowance.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Location of the license data directory (the parent of the rules and
/// licenses directories above) inside the reference scancode-toolkit submodule.
///
/// Only test code and the xtask `generate-license-loader-artifact` binary use
/// this constant, hence the `dead_code` allowance.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// Default URL template pointing at the ScanCode LicenseDB site.
///
/// `{}` is the placeholder to substitute.
// NOTE(review): presumably the placeholder receives a license key or file
// name; confirm against the call sites.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
71
72pub(crate) use detection::{
73    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
74};
75pub use models::LicenseMatch;
76
77pub use aho_match::aho_match;
78pub use hash_match::hash_match;
79pub use match_refine::{
80    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
81    refine_matches_without_false_positive_filter, split_weak_matches,
82};
83pub use position_set::PositionSet;
84pub use spdx_lid::spdx_lid_match;
85pub use token_multiset::TokenMultiset;
86pub use token_set::TokenSet;
87pub use unknown_match::unknown_match;
88
89use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
90
91/// License detection engine that orchestrates the detection pipeline.
92///
93/// The engine loads license rules and builds an index for efficient matching.
94/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
95/// and combines their results into final license detections.
96#[derive(Debug, Clone)]
97pub struct LicenseDetectionEngine {
98    index: Arc<index::LicenseIndex>,
99    spdx_mapping: SpdxMapping,
100}
101
/// Largest input, in bytes, that is handed to detection (10MB); longer text is
/// truncated first.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to `select_seq_candidates` for the regular
/// (per query run) sequence matching pass.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Tolerance, in query positions, for container-only positions in the
/// redundant sequence-container heuristics.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Tolerance, in query positions, for child positions that fall outside the
/// container in the redundant same-expression container heuristic.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
106
107fn truncate_detection_text(clean_text: &str) -> &str {
108    if clean_text.len() <= MAX_DETECTION_SIZE {
109        return clean_text;
110    }
111
112    log::debug!(
113        "Content size {} exceeds limit {}, truncating for detection",
114        clean_text.len(),
115        MAX_DETECTION_SIZE
116    );
117
118    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
119    &clean_text[..boundary]
120}
121
122fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
123    (!m.query_span().is_empty()).then(|| m.query_span().clone())
124}
125
126fn has_full_match_coverage(m: &LicenseMatch) -> bool {
127    m.coverage() == 100.0
128}
129
130fn is_redundant_same_expression_seq_container(
131    container: &LicenseMatch,
132    candidate_contained_matches: &[LicenseMatch],
133) -> bool {
134    let container_is_redundant_coverage =
135        has_full_match_coverage(container) || container.coverage() >= 99.0;
136    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
137        return false;
138    }
139
140    let container_qspan_set = container.qspan_set();
141
142    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
143        .iter()
144        .filter(|m| {
145            m.matcher == MatcherKind::Aho
146                && has_full_match_coverage(m)
147                && m.license_expression == container.license_expression
148                && m.overlaps_with(&container_qspan_set)
149        })
150        .collect();
151
152    if contained.len() < 2 {
153        return false;
154    }
155
156    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
157    if material_children < 2 {
158        return false;
159    }
160
161    contained.sort_by_key(|m| m.qspan_bounds());
162
163    let mut child_union = PositionSet::new();
164    for m in &contained {
165        child_union.extend_from_span(m.query_span());
166    }
167
168    let container_only_positions = container_qspan_set.difference(&child_union);
169    let child_only_positions = child_union.difference(&container_qspan_set);
170
171    let mut bridge_positions = BitSet::new();
172    for pair in contained.windows(2) {
173        let (_, previous_end) = pair[0].qspan_bounds();
174        let (next_start, _) = pair[1].qspan_bounds();
175
176        if next_start < previous_end {
177            return false;
178        }
179
180        for pos in previous_end..next_start {
181            bridge_positions.insert(pos);
182        }
183    }
184
185    let container_only_boundary_positions = container_only_positions
186        .iter()
187        .filter(|&pos| !bridge_positions.contains(pos))
188        .count();
189
190    if container_only_positions.len() == 1
191        && container_only_boundary_positions == 0
192        && child_only_positions.is_empty()
193    {
194        return false;
195    }
196
197    if child_only_positions.is_empty()
198        && container_only_positions.len() == container_only_boundary_positions
199        && container_only_boundary_positions <= 3
200    {
201        let earliest_child = contained
202            .iter()
203            .map(|m| m.qspan_bounds().0)
204            .min()
205            .unwrap_or(usize::MAX);
206        let latest_child = contained
207            .iter()
208            .map(|m| m.qspan_bounds().1.saturating_sub(1))
209            .max()
210            .unwrap_or(0);
211
212        let is_one_sided_boundary = container_only_positions
213            .iter()
214            .all(|pos| pos < earliest_child)
215            || container_only_positions
216                .iter()
217                .all(|pos| pos > latest_child);
218
219        if is_one_sided_boundary {
220            return false;
221        }
222    }
223
224    let max_container_only_positions =
225        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
226    let max_container_boundary_positions =
227        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
228    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
229
230    container_only_positions.len() <= max_container_only_positions
231        && container_only_boundary_positions <= max_container_boundary_positions
232        && child_only_positions.len() <= max_child_only_positions
233}
234
235fn filter_redundant_same_expression_seq_containers(
236    seq_matches: Vec<LicenseMatch>,
237    candidate_contained_matches: &[LicenseMatch],
238) -> Vec<LicenseMatch> {
239    seq_matches
240        .into_iter()
241        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
242        .collect()
243}
244
245fn is_redundant_low_coverage_composite_seq_wrapper(
246    container: &LicenseMatch,
247    candidate_contained_matches: &[LicenseMatch],
248) -> bool {
249    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
250        return false;
251    }
252
253    let container_qspan_set = container.qspan_set();
254
255    let children: Vec<&LicenseMatch> = candidate_contained_matches
256        .iter()
257        .filter(|m| {
258            m.matcher == aho_match::MATCH_AHO
259                && has_full_match_coverage(m)
260                && m.license_expression != container.license_expression
261                && m.overlaps_with(&container_qspan_set)
262        })
263        .collect();
264
265    if children.len() < 2 {
266        return false;
267    }
268
269    let unique_expressions: HashSet<&str> = children
270        .iter()
271        .map(|m| m.license_expression.as_str())
272        .collect();
273    if unique_expressions.len() < 2 {
274        return false;
275    }
276
277    let mut child_union = PositionSet::new();
278    for m in &children {
279        child_union.extend_from_span(m.query_span());
280    }
281
282    let container_only_positions = container_qspan_set.difference(&child_union);
283    let child_only_positions = child_union.difference(&container_qspan_set);
284
285    let mut sorted_children = children;
286    sorted_children.sort_by_key(|m| m.qspan_bounds());
287
288    let mut bridge_positions = BitSet::new();
289    for pair in sorted_children.windows(2) {
290        let (_, previous_end) = pair[0].qspan_bounds();
291        let (next_start, _) = pair[1].qspan_bounds();
292        for pos in previous_end..next_start {
293            bridge_positions.insert(pos);
294        }
295    }
296
297    let container_only_boundary_positions = container_only_positions
298        .iter()
299        .filter(|&pos| !bridge_positions.contains(pos))
300        .count();
301
302    child_only_positions.is_empty()
303        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
304        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
305}
306
307fn filter_redundant_low_coverage_composite_seq_wrappers(
308    seq_matches: Vec<LicenseMatch>,
309    candidate_contained_matches: &[LicenseMatch],
310) -> Vec<LicenseMatch> {
311    seq_matches
312        .into_iter()
313        .filter(|m| {
314            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
315        })
316        .collect()
317}
318
319fn subtract_spdx_match_qspans(
320    query: &mut Query<'_>,
321    matched_qspans: &mut Vec<models::PositionSpan>,
322    aho_extra_matchables: &mut PositionSet,
323    spdx_matches: &[LicenseMatch],
324) {
325    for m in spdx_matches {
326        let Some(span) = query_span_for_match(m) else {
327            continue;
328        };
329
330        aho_extra_matchables.extend_from_span(&span);
331        query.subtract(&span);
332
333        if has_full_match_coverage(m) {
334            matched_qspans.push(span);
335        }
336    }
337}
338
339fn merge_and_prepare_aho_matches(
340    index: &index::LicenseIndex,
341    query: &mut Query<'_>,
342    matched_qspans: &mut Vec<models::PositionSpan>,
343    refined_aho: &[LicenseMatch],
344) -> (Vec<LicenseMatch>, bool) {
345    let merged_aho = merge_overlapping_matches(refined_aho);
346    let mut saw_long_exact_license_text_match = false;
347
348    for m in &merged_aho {
349        let Some(span) = query_span_for_match(m) else {
350            continue;
351        };
352
353        if has_full_match_coverage(m) {
354            matched_qspans.push(span.clone());
355        }
356
357        if index
358            .rules_by_rid
359            .get(m.rid)
360            .is_some_and(|rule| rule.is_license_text())
361            && m.rule_length > 120
362            && m.coverage() > 98.0
363        {
364            query.subtract(&span);
365            saw_long_exact_license_text_match = true;
366        }
367    }
368
369    (merged_aho, saw_long_exact_license_text_match)
370}
371
372fn collect_whole_query_exact_followup_matches(
373    index: &index::LicenseIndex,
374    query: &mut Query<'_>,
375    matched_qspans: &mut Vec<models::PositionSpan>,
376    whole_run: &query::QueryRun<'_>,
377) -> Vec<LicenseMatch> {
378    let mut seq_all_matches = Vec::new();
379
380    if whole_run.is_matchable(false, matched_qspans) {
381        let near_dupe_candidates =
382            select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
383
384        if !near_dupe_candidates.is_empty() {
385            let near_dupe_matches =
386                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
387
388            for m in &near_dupe_matches {
389                if !m.query_span().is_empty() {
390                    let span = m.query_span().clone();
391                    query.subtract(&span);
392                    matched_qspans.push(span);
393                }
394            }
395
396            seq_all_matches.extend(near_dupe_matches);
397        }
398    }
399
400    seq_all_matches
401}
402
403fn collect_regular_seq_matches(
404    index: &index::LicenseIndex,
405    query: &Query<'_>,
406    matched_qspans: &[models::PositionSpan],
407    candidate_contained_matches: &[LicenseMatch],
408) -> Vec<LicenseMatch> {
409    let mut seq_all_matches = Vec::new();
410
411    for query_run in query.query_runs() {
412        if !query_run.is_matchable(false, matched_qspans) {
413            continue;
414        }
415
416        let candidates =
417            select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
418        if !candidates.is_empty() {
419            let matches = seq_match_with_candidates(index, &query_run, &candidates);
420            seq_all_matches.extend(matches);
421        }
422    }
423
424    let merged_seq = merge_overlapping_matches(&seq_all_matches);
425    let filtered_same_expression =
426        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
427    filter_redundant_low_coverage_composite_seq_wrappers(
428        filtered_same_expression,
429        candidate_contained_matches,
430    )
431}
432
433impl LicenseDetectionEngine {
434    /// Create a new license detection engine from a pre-built license index.
435    ///
436    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
437    /// It builds the SPDX mapping from the licenses in the index.
438    fn from_index(index: index::LicenseIndex) -> Result<Self> {
439        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
440        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
441        let spdx_mapping = build_spdx_mapping(&license_vec);
442
443        Ok(Self {
444            index: Arc::new(index),
445            spdx_mapping,
446        })
447    }
448
449    /// Create a new license detection engine from the embedded license index.
450    ///
451    /// This method loads the build-time embedded license artifact and constructs
452    /// the runtime license index. This eliminates the runtime dependency on the
453    /// ScanCode rules directory.
454    ///
455    /// # Returns
456    /// A Result containing the engine or an error
457    pub fn from_embedded() -> Result<Self> {
458        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
459        let index = load_license_index_from_bytes(artifact_bytes)
460            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
461        Self::from_index(index)
462    }
463
464    /// Create a new license detection engine from a directory of license rules.
465    ///
466    /// # Arguments
467    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
468    ///
469    /// # Returns
470    /// A Result containing the engine or an error
471    pub fn from_directory(rules_path: &Path) -> Result<Self> {
472        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
473            (rules_path.join("rules"), rules_path.join("licenses"))
474        } else if rules_path.ends_with("rules") {
475            let parent = rules_path.parent().ok_or_else(|| {
476                anyhow::anyhow!("Cannot determine parent directory for rules path")
477            })?;
478            (rules_path.to_path_buf(), parent.join("licenses"))
479        } else {
480            (rules_path.to_path_buf(), rules_path.to_path_buf())
481        };
482
483        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
484        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
485        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
486
487        Self::from_index(index)
488    }
489
490    pub fn detect_with_kind(
491        &self,
492        text: &str,
493        unknown_licenses: bool,
494        binary_derived: bool,
495    ) -> Result<Vec<LicenseDetection>> {
496        self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
497    }
498
499    pub fn detect_with_kind_with_score(
500        &self,
501        text: &str,
502        unknown_licenses: bool,
503        binary_derived: bool,
504        min_score: f32,
505    ) -> Result<Vec<LicenseDetection>> {
506        let clean_text = strip_utf8_bom_str(text);
507
508        let content = truncate_detection_text(clean_text);
509
510        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
511        let whole_query_run = query.whole_query_run();
512
513        let mut all_matches = Vec::new();
514        let mut candidate_contained_matches = Vec::new();
515        let mut aho_extra_matchables = PositionSet::new();
516        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
517
518        // Phase 1a: Hash matching
519        // Python returns immediately if hash matches found (index.py:987-991)
520        {
521            let hash_matches = hash_match(&self.index, &whole_query_run);
522
523            if !hash_matches.is_empty() {
524                let mut matches = hash_matches;
525                sort_matches_by_line(&mut matches);
526
527                let groups = group_matches_by_region(&matches);
528                let detections: Vec<LicenseDetection> = groups
529                    .iter()
530                    .map(|group| {
531                        let mut detection = empty_detection();
532                        populate_detection_from_group_with_spdx(
533                            &mut detection,
534                            group,
535                            &self.spdx_mapping,
536                            Some(content),
537                        );
538                        detection
539                    })
540                    .collect();
541
542                return Ok(post_process_detections(detections, min_score));
543            }
544        }
545
546        // Phase 1b: SPDX-LID matching
547        {
548            let spdx_matches = spdx_lid_match(&self.index, &query);
549            subtract_spdx_match_qspans(
550                &mut query,
551                &mut matched_qspans,
552                &mut aho_extra_matchables,
553                &spdx_matches,
554            );
555            all_matches.extend(spdx_matches);
556        }
557
558        // Phase 1c: Aho-Corasick matching
559        {
560            let aho_matches = if aho_extra_matchables.is_empty() {
561                aho_match(&self.index, &whole_query_run)
562            } else {
563                aho_match::aho_match_with_extra_matchables(
564                    &self.index,
565                    &whole_query_run,
566                    Some(&aho_extra_matchables),
567                )
568            };
569
570            // Python's get_exact_matches() calls refine_matches with merge=False
571            // This applies quality filters including required phrase filtering
572            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
573            candidate_contained_matches.extend(refined_aho.clone());
574            let (merged_aho, _) = merge_and_prepare_aho_matches(
575                &self.index,
576                &mut query,
577                &mut matched_qspans,
578                &refined_aho,
579            );
580            all_matches.extend(merged_aho);
581
582            let whole_query_followup = collect_whole_query_exact_followup_matches(
583                &self.index,
584                &mut query,
585                &mut matched_qspans,
586                &whole_query_run,
587            );
588            all_matches.extend(whole_query_followup);
589
590            let merged_seq = collect_regular_seq_matches(
591                &self.index,
592                &query,
593                &matched_qspans,
594                &candidate_contained_matches,
595            );
596            all_matches.extend(merged_seq);
597        }
598
599        // Step 1: Initial refine WITHOUT false positive filtering
600        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
601        let merged_matches =
602            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
603
604        // Step 2: Unknown detection and weak match handling
605        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
606        let refined_matches = if unknown_licenses {
607            // Split weak from good - Python: index.py:1083
608            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
609
610            // Unknown detection on uncovered regions - Python: index.py:1093-1114
611            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
612            let filtered_unknown =
613                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
614
615            let mut all_matches = good_matches;
616            all_matches.extend(filtered_unknown);
617            // reinject weak matches and let refine matches keep the bests
618            // Python: index.py:1117-1118
619            all_matches.extend(weak_matches);
620            all_matches
621        } else {
622            merged_matches
623        };
624
625        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
626        let refined = refine_matches(&self.index, refined_matches, &query);
627
628        let mut sorted = refined;
629        sort_matches_by_line(&mut sorted);
630
631        let groups = group_matches_by_region(&sorted);
632
633        let detections: Vec<LicenseDetection> = groups
634            .iter()
635            .map(|group| {
636                let mut detection = empty_detection();
637                populate_detection_from_group_with_spdx(
638                    &mut detection,
639                    group,
640                    &self.spdx_mapping,
641                    Some(content),
642                );
643                detection
644            })
645            .collect();
646
647        let detections = post_process_detections(detections, min_score);
648
649        Ok(detections)
650    }
651
652    pub fn detect_with_kind_and_source(
653        &self,
654        text: &str,
655        unknown_licenses: bool,
656        binary_derived: bool,
657        source_path: &str,
658    ) -> Result<Vec<LicenseDetection>> {
659        let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
660        attach_source_path_to_detections(&mut detections, source_path);
661        Ok(detections)
662    }
663
664    pub fn detect_with_kind_and_source_with_score(
665        &self,
666        text: &str,
667        unknown_licenses: bool,
668        binary_derived: bool,
669        source_path: &str,
670        min_score: f32,
671    ) -> Result<Vec<LicenseDetection>> {
672        let mut detections =
673            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
674        attach_source_path_to_detections(&mut detections, source_path);
675        Ok(detections)
676    }
677
678    /// Detect licenses and return raw matches (like Python's idx.match()).
679    ///
680    /// This is primarily used by golden tests and maintenance tooling that need
681    /// raw match sequences before grouping or post-processing into detections.
682    #[cfg(any(test, feature = "golden-tests"))]
683    pub fn detect_matches_with_kind(
684        &self,
685        text: &str,
686        unknown_licenses: bool,
687        binary_derived: bool,
688    ) -> Result<Vec<LicenseMatch>> {
689        let clean_text = strip_utf8_bom_str(text);
690
691        let content = truncate_detection_text(clean_text);
692
693        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
694        let whole_query_run = query.whole_query_run();
695
696        let mut all_matches = Vec::new();
697        let mut candidate_contained_matches = Vec::new();
698        let mut aho_extra_matchables = PositionSet::new();
699        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
700
701        // Phase 1a: Hash matching
702        {
703            let hash_matches = hash_match(&self.index, &whole_query_run);
704
705            if !hash_matches.is_empty() {
706                let mut matches = hash_matches;
707                sort_matches_by_line(&mut matches);
708                return Ok(matches);
709            }
710        }
711
712        // Phase 1b: SPDX-LID matching
713        {
714            let spdx_matches = spdx_lid_match(&self.index, &query);
715            subtract_spdx_match_qspans(
716                &mut query,
717                &mut matched_qspans,
718                &mut aho_extra_matchables,
719                &spdx_matches,
720            );
721            all_matches.extend(spdx_matches);
722        }
723
724        // Phase 1c: Aho-Corasick matching
725        {
726            let aho_matches = if aho_extra_matchables.is_empty() {
727                aho_match(&self.index, &whole_query_run)
728            } else {
729                aho_match::aho_match_with_extra_matchables(
730                    &self.index,
731                    &whole_query_run,
732                    Some(&aho_extra_matchables),
733                )
734            };
735            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
736            candidate_contained_matches.extend(refined_aho.clone());
737            let (merged_aho, _) = merge_and_prepare_aho_matches(
738                &self.index,
739                &mut query,
740                &mut matched_qspans,
741                &refined_aho,
742            );
743            all_matches.extend(merged_aho);
744
745            let whole_query_followup = collect_whole_query_exact_followup_matches(
746                &self.index,
747                &mut query,
748                &mut matched_qspans,
749                &whole_query_run,
750            );
751            all_matches.extend(whole_query_followup);
752
753            let merged_seq = collect_regular_seq_matches(
754                &self.index,
755                &query,
756                &matched_qspans,
757                &candidate_contained_matches,
758            );
759            all_matches.extend(merged_seq);
760        }
761
762        // Step 1: Initial refine WITHOUT false positive filtering
763        let merged_matches =
764            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
765
766        // Step 2: Unknown detection and weak match handling
767        let refined_matches = if unknown_licenses {
768            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
769            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
770            let filtered_unknown =
771                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
772
773            let mut all_matches = good_matches;
774            all_matches.extend(filtered_unknown);
775            all_matches.extend(weak_matches);
776            all_matches
777        } else {
778            merged_matches
779        };
780
781        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
782        let refined = refine_matches(&self.index, refined_matches, &query);
783
784        let mut sorted = refined;
785        sort_matches_by_line(&mut sorted);
786
787        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
788        Ok(sorted)
789    }
790
791    /// Get a reference to the license index.
792    pub fn index(&self) -> &index::LicenseIndex {
793        &self.index
794    }
795
796    /// Get a reference to the SPDX mapping.
797    #[cfg(test)]
798    pub fn spdx_mapping(&self) -> &SpdxMapping {
799        &self.spdx_mapping
800    }
801}
802
803#[cfg(test)]
804mod tests;