Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7mod position_set;
8mod token_set;
9
10#[cfg(test)]
11mod embedded_test;
12pub mod expression;
13#[cfg(test)]
14mod golden_test;
15pub mod hash_match;
16pub mod index;
17mod match_refine;
18pub mod models;
19pub mod query;
20pub mod rules;
21pub mod seq_match;
22pub mod spans;
23pub mod spdx_lid;
24pub mod spdx_mapping;
25#[cfg(test)]
26mod test_utils;
27pub mod tokenize;
28pub mod unknown_match;
29
30use bit_set::BitSet;
31use std::collections::HashSet;
32use std::path::Path;
33use std::sync::Arc;
34
35use anyhow::Result;
36
37use crate::license_detection::embedded::index::load_license_index_from_bytes;
38use crate::license_detection::index::build_index_from_loaded;
39use crate::license_detection::query::Query;
40use crate::license_detection::rules::{
41    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
42};
43use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
44use crate::utils::text::strip_utf8_bom_str;
45
46use crate::license_detection::detection::{
47    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
48};
49use crate::license_detection::models::MatcherKind;
50
51/// Path to the license rules directory in the reference scancode-toolkit submodule.
52/// Used by test code and the xtask generate-license-loader-artifact binary.
53#[allow(dead_code)]
54pub const SCANCODE_LICENSES_RULES_PATH: &str =
55    "reference/scancode-toolkit/src/licensedcode/data/rules";
56
57/// Path to the licenses directory in the reference scancode-toolkit submodule.
58/// Used by test code and the xtask generate-license-loader-artifact binary.
59#[allow(dead_code)]
60pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
61    "reference/scancode-toolkit/src/licensedcode/data/licenses";
62
63/// Path to the license data directory in the reference scancode-toolkit submodule.
64/// Used by test code and the xtask generate-license-loader-artifact binary.
65#[allow(dead_code)]
66pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
67
/// Default URL template for the public ScanCode LicenseDB site.
///
/// NOTE(review): the trailing `{}` is presumably a placeholder that callers
/// replace with a license key or file name; confirm against the call sites.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
69
70pub(crate) use detection::{
71    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
72};
73pub use models::LicenseMatch;
74
75pub use aho_match::aho_match;
76pub use hash_match::hash_match;
77pub use match_refine::{
78    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
79    refine_matches_without_false_positive_filter, split_weak_matches,
80};
81pub use position_set::PositionSet;
82pub use seq_match::{
83    MAX_NEAR_DUPE_CANDIDATES, compute_candidates_with_msets, seq_match_with_candidates,
84};
85pub use spdx_lid::spdx_lid_match;
86pub use token_set::TokenSet;
87pub use unknown_match::unknown_match;
88
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
///
/// Cloning the engine is cheap with respect to the index: the index is held
/// behind an [`Arc`], so clones share it instead of copying it.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Shared license index that every matcher in the pipeline reads from.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses in `from_index` and used
    /// when populating detections from match groups.
    spdx_mapping: SpdxMapping,
}
99
/// Maximum number of bytes of input text handed to detection; longer input is
/// cut down to this size by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
/// Candidate limit passed to `compute_candidates_with_msets` for the per-query-run
/// sequence matching done in `collect_regular_seq_matches`.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Base budget for positions that a sequence container covers but its exact
/// children do not, used by the redundant-container checks in this module.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base budget for positions that the exact children cover but the sequence
/// container does not (the same-expression check allows this value plus one).
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
104
105fn truncate_detection_text(clean_text: &str) -> &str {
106    if clean_text.len() <= MAX_DETECTION_SIZE {
107        return clean_text;
108    }
109
110    log::warn!(
111        "Content size {} exceeds limit {}, truncating for detection",
112        clean_text.len(),
113        MAX_DETECTION_SIZE
114    );
115
116    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
117    &clean_text[..boundary]
118}
119
120fn query_span_for_match(m: &LicenseMatch) -> Option<query::PositionSpan> {
121    (m.end_token > m.start_token).then(|| query::PositionSpan::new(m.start_token, m.end_token - 1))
122}
123
124fn has_full_match_coverage(m: &LicenseMatch) -> bool {
125    m.coverage() == 100.0
126}
127
128fn is_redundant_same_expression_seq_container(
129    container: &LicenseMatch,
130    candidate_contained_matches: &[LicenseMatch],
131) -> bool {
132    let container_is_redundant_coverage =
133        has_full_match_coverage(container) || container.coverage() >= 99.0;
134    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
135        return false;
136    }
137
138    let container_qspan_set: PositionSet = container.qspan().into_iter().collect();
139
140    let mut contained: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
141        .iter()
142        .filter_map(|m| {
143            if m.matcher == MatcherKind::Aho
144                && has_full_match_coverage(m)
145                && m.license_expression == container.license_expression
146                && m.overlaps_with(&container_qspan_set)
147            {
148                Some((m, m.qspan()))
149            } else {
150                None
151            }
152        })
153        .collect();
154
155    if contained.len() < 2 {
156        return false;
157    }
158
159    let material_children = contained
160        .iter()
161        .filter(|(m, _)| m.matched_length > 1)
162        .count();
163    if material_children < 2 {
164        return false;
165    }
166
167    contained.sort_by_key(|(m, _)| m.qspan_bounds());
168
169    let mut child_union = PositionSet::new();
170    for (_, qspan) in &contained {
171        for &pos in qspan {
172            child_union.insert(pos);
173        }
174    }
175
176    let container_only_positions = container_qspan_set.difference(&child_union);
177    let child_only_positions = child_union.difference(&container_qspan_set);
178
179    let mut bridge_positions = BitSet::new();
180    for pair in contained.windows(2) {
181        let (_, previous_end) = pair[0].0.qspan_bounds();
182        let (next_start, _) = pair[1].0.qspan_bounds();
183
184        if next_start < previous_end {
185            return false;
186        }
187
188        for pos in previous_end..next_start {
189            bridge_positions.insert(pos);
190        }
191    }
192
193    let container_only_boundary_positions = container_only_positions
194        .iter()
195        .filter(|&pos| !bridge_positions.contains(pos))
196        .count();
197
198    if container_only_positions.len() == 1
199        && container_only_boundary_positions == 0
200        && child_only_positions.is_empty()
201    {
202        return false;
203    }
204
205    if child_only_positions.is_empty()
206        && container_only_positions.len() == container_only_boundary_positions
207        && container_only_boundary_positions <= 3
208    {
209        let earliest_child = contained
210            .iter()
211            .map(|(m, _)| m.qspan_bounds().0)
212            .min()
213            .unwrap_or(usize::MAX);
214        let latest_child = contained
215            .iter()
216            .map(|(m, _)| m.qspan_bounds().1.saturating_sub(1))
217            .max()
218            .unwrap_or(0);
219
220        let is_one_sided_boundary = container_only_positions
221            .iter()
222            .all(|pos| pos < earliest_child)
223            || container_only_positions
224                .iter()
225                .all(|pos| pos > latest_child);
226
227        if is_one_sided_boundary {
228            return false;
229        }
230    }
231
232    let max_container_only_positions =
233        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
234    let max_container_boundary_positions =
235        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
236    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
237
238    container_only_positions.len() <= max_container_only_positions
239        && container_only_boundary_positions <= max_container_boundary_positions
240        && child_only_positions.len() <= max_child_only_positions
241}
242
243fn filter_redundant_same_expression_seq_containers(
244    seq_matches: Vec<LicenseMatch>,
245    candidate_contained_matches: &[LicenseMatch],
246) -> Vec<LicenseMatch> {
247    seq_matches
248        .into_iter()
249        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
250        .collect()
251}
252
253fn is_redundant_low_coverage_composite_seq_wrapper(
254    container: &LicenseMatch,
255    candidate_contained_matches: &[LicenseMatch],
256) -> bool {
257    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
258        return false;
259    }
260
261    let container_qspan_set: PositionSet = container.qspan().into_iter().collect();
262
263    let children: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
264        .iter()
265        .filter_map(|m| {
266            if m.matcher == aho_match::MATCH_AHO
267                && has_full_match_coverage(m)
268                && m.license_expression != container.license_expression
269                && m.overlaps_with(&container_qspan_set)
270            {
271                Some((m, m.qspan()))
272            } else {
273                None
274            }
275        })
276        .collect();
277
278    if children.len() < 2 {
279        return false;
280    }
281
282    let unique_expressions: HashSet<&str> = children
283        .iter()
284        .map(|(m, _)| m.license_expression.as_str())
285        .collect();
286    if unique_expressions.len() < 2 {
287        return false;
288    }
289
290    let mut child_union = PositionSet::new();
291    for (_, qspan) in &children {
292        for &pos in qspan {
293            child_union.insert(pos);
294        }
295    }
296
297    let container_only_positions = container_qspan_set.difference(&child_union);
298    let child_only_positions = child_union.difference(&container_qspan_set);
299
300    let mut sorted_children = children;
301    sorted_children.sort_by_key(|(m, _)| m.qspan_bounds());
302
303    let mut bridge_positions = BitSet::new();
304    for pair in sorted_children.windows(2) {
305        let (_, previous_end) = pair[0].0.qspan_bounds();
306        let (next_start, _) = pair[1].0.qspan_bounds();
307        for pos in previous_end..next_start {
308            bridge_positions.insert(pos);
309        }
310    }
311
312    let container_only_boundary_positions = container_only_positions
313        .iter()
314        .filter(|&pos| !bridge_positions.contains(pos))
315        .count();
316
317    child_only_positions.is_empty()
318        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
319        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
320}
321
322fn filter_redundant_low_coverage_composite_seq_wrappers(
323    seq_matches: Vec<LicenseMatch>,
324    candidate_contained_matches: &[LicenseMatch],
325) -> Vec<LicenseMatch> {
326    seq_matches
327        .into_iter()
328        .filter(|m| {
329            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
330        })
331        .collect()
332}
333
334fn subtract_spdx_match_qspans(
335    query: &mut Query<'_>,
336    matched_qspans: &mut Vec<query::PositionSpan>,
337    aho_extra_matchables: &mut BitSet,
338    spdx_matches: &[LicenseMatch],
339) {
340    for m in spdx_matches {
341        let Some(span) = query_span_for_match(m) else {
342            continue;
343        };
344
345        for pos in span.iter() {
346            aho_extra_matchables.insert(pos);
347        }
348        query.subtract(&span);
349
350        if has_full_match_coverage(m) {
351            matched_qspans.push(span);
352        }
353    }
354}
355
356fn merge_and_prepare_aho_matches(
357    index: &index::LicenseIndex,
358    query: &mut Query<'_>,
359    matched_qspans: &mut Vec<query::PositionSpan>,
360    refined_aho: &[LicenseMatch],
361) -> (Vec<LicenseMatch>, bool) {
362    let merged_aho = merge_overlapping_matches(refined_aho);
363    let mut saw_long_exact_license_text_match = false;
364
365    for m in &merged_aho {
366        let Some(span) = query_span_for_match(m) else {
367            continue;
368        };
369
370        if has_full_match_coverage(m) {
371            matched_qspans.push(span.clone());
372        }
373
374        if index
375            .rules_by_rid
376            .get(m.rid)
377            .is_some_and(|rule| rule.is_license_text())
378            && m.rule_length > 120
379            && m.coverage() > 98.0
380        {
381            query.subtract(&span);
382            saw_long_exact_license_text_match = true;
383        }
384    }
385
386    (merged_aho, saw_long_exact_license_text_match)
387}
388
389fn collect_whole_query_exact_followup_matches(
390    index: &index::LicenseIndex,
391    query: &mut Query<'_>,
392    matched_qspans: &mut Vec<query::PositionSpan>,
393    whole_run: &query::QueryRun<'_>,
394) -> Vec<LicenseMatch> {
395    let mut seq_all_matches = Vec::new();
396
397    if whole_run.is_matchable(false, matched_qspans) {
398        let near_dupe_candidates =
399            compute_candidates_with_msets(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
400
401        if !near_dupe_candidates.is_empty() {
402            let near_dupe_matches =
403                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
404
405            for m in &near_dupe_matches {
406                if m.end_token > m.start_token {
407                    let span = query::PositionSpan::new(m.start_token, m.end_token - 1);
408                    query.subtract(&span);
409                    matched_qspans.push(span);
410                }
411            }
412
413            seq_all_matches.extend(near_dupe_matches);
414        }
415    }
416
417    seq_all_matches
418}
419
420fn collect_regular_seq_matches(
421    index: &index::LicenseIndex,
422    query: &Query<'_>,
423    matched_qspans: &[query::PositionSpan],
424    candidate_contained_matches: &[LicenseMatch],
425) -> Vec<LicenseMatch> {
426    let mut seq_all_matches = Vec::new();
427
428    for query_run in query.query_runs() {
429        if !query_run.is_matchable(false, matched_qspans) {
430            continue;
431        }
432
433        let candidates =
434            compute_candidates_with_msets(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
435        if !candidates.is_empty() {
436            let matches = seq_match_with_candidates(index, &query_run, &candidates);
437            seq_all_matches.extend(matches);
438        }
439    }
440
441    let merged_seq = merge_overlapping_matches(&seq_all_matches);
442    let filtered_same_expression =
443        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
444    filter_redundant_low_coverage_composite_seq_wrappers(
445        filtered_same_expression,
446        candidate_contained_matches,
447    )
448}
449
450impl LicenseDetectionEngine {
451    /// Create a new license detection engine from a pre-built license index.
452    ///
453    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
454    /// It builds the SPDX mapping from the licenses in the index.
455    fn from_index(index: index::LicenseIndex) -> Result<Self> {
456        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
457        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
458        let spdx_mapping = build_spdx_mapping(&license_vec);
459
460        Ok(Self {
461            index: Arc::new(index),
462            spdx_mapping,
463        })
464    }
465
466    /// Create a new license detection engine from the embedded license index.
467    ///
468    /// This method loads the build-time embedded license artifact and constructs
469    /// the runtime license index. This eliminates the runtime dependency on the
470    /// ScanCode rules directory.
471    ///
472    /// # Returns
473    /// A Result containing the engine or an error
474    pub fn from_embedded() -> Result<Self> {
475        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
476        let index = load_license_index_from_bytes(artifact_bytes)
477            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
478        Self::from_index(index)
479    }
480
481    /// Create a new license detection engine from a directory of license rules.
482    ///
483    /// # Arguments
484    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
485    ///
486    /// # Returns
487    /// A Result containing the engine or an error
488    pub fn from_directory(rules_path: &Path) -> Result<Self> {
489        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
490            (rules_path.join("rules"), rules_path.join("licenses"))
491        } else if rules_path.ends_with("rules") {
492            let parent = rules_path.parent().ok_or_else(|| {
493                anyhow::anyhow!("Cannot determine parent directory for rules path")
494            })?;
495            (rules_path.to_path_buf(), parent.join("licenses"))
496        } else {
497            (rules_path.to_path_buf(), rules_path.to_path_buf())
498        };
499
500        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
501        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
502        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
503
504        Self::from_index(index)
505    }
506
507    pub fn detect_with_kind(
508        &self,
509        text: &str,
510        unknown_licenses: bool,
511        binary_derived: bool,
512    ) -> Result<Vec<LicenseDetection>> {
513        self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
514    }
515
516    pub fn detect_with_kind_with_score(
517        &self,
518        text: &str,
519        unknown_licenses: bool,
520        binary_derived: bool,
521        min_score: f32,
522    ) -> Result<Vec<LicenseDetection>> {
523        let clean_text = strip_utf8_bom_str(text);
524
525        let content = truncate_detection_text(clean_text);
526
527        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
528        let whole_query_run = query.whole_query_run();
529
530        let mut all_matches = Vec::new();
531        let mut candidate_contained_matches = Vec::new();
532        let mut aho_extra_matchables = BitSet::new();
533        let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
534
535        // Phase 1a: Hash matching
536        // Python returns immediately if hash matches found (index.py:987-991)
537        {
538            let hash_matches = hash_match(&self.index, &whole_query_run);
539
540            if !hash_matches.is_empty() {
541                let mut matches = hash_matches;
542                sort_matches_by_line(&mut matches);
543
544                let groups = group_matches_by_region(&matches);
545                let detections: Vec<LicenseDetection> = groups
546                    .iter()
547                    .map(|group| {
548                        let mut detection = empty_detection();
549                        populate_detection_from_group_with_spdx(
550                            &mut detection,
551                            group,
552                            &self.spdx_mapping,
553                        );
554                        detection
555                    })
556                    .collect();
557
558                return Ok(post_process_detections(detections, min_score));
559            }
560        }
561
562        // Phase 1b: SPDX-LID matching
563        {
564            let spdx_matches = spdx_lid_match(&self.index, &query);
565            let merged_spdx = merge_overlapping_matches(&spdx_matches);
566            subtract_spdx_match_qspans(
567                &mut query,
568                &mut matched_qspans,
569                &mut aho_extra_matchables,
570                &merged_spdx,
571            );
572            all_matches.extend(merged_spdx);
573        }
574
575        // Phase 1c: Aho-Corasick matching
576        {
577            let aho_matches = if aho_extra_matchables.is_empty() {
578                aho_match(&self.index, &whole_query_run)
579            } else {
580                aho_match::aho_match_with_extra_matchables(
581                    &self.index,
582                    &whole_query_run,
583                    Some(&aho_extra_matchables),
584                )
585            };
586
587            // Python's get_exact_matches() calls refine_matches with merge=False
588            // This applies quality filters including required phrase filtering
589            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
590            candidate_contained_matches.extend(refined_aho.clone());
591            let (merged_aho, _) = merge_and_prepare_aho_matches(
592                &self.index,
593                &mut query,
594                &mut matched_qspans,
595                &refined_aho,
596            );
597            all_matches.extend(merged_aho);
598
599            let whole_query_followup = collect_whole_query_exact_followup_matches(
600                &self.index,
601                &mut query,
602                &mut matched_qspans,
603                &whole_query_run,
604            );
605            all_matches.extend(whole_query_followup);
606
607            let merged_seq = collect_regular_seq_matches(
608                &self.index,
609                &query,
610                &matched_qspans,
611                &candidate_contained_matches,
612            );
613            all_matches.extend(merged_seq);
614        }
615
616        // Step 1: Initial refine WITHOUT false positive filtering
617        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
618        let merged_matches =
619            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
620
621        // Step 2: Unknown detection and weak match handling
622        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
623        let refined_matches = if unknown_licenses {
624            // Split weak from good - Python: index.py:1083
625            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
626
627            // Unknown detection on uncovered regions - Python: index.py:1093-1114
628            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
629            let filtered_unknown =
630                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
631
632            let mut all_matches = good_matches;
633            all_matches.extend(filtered_unknown);
634            // reinject weak matches and let refine matches keep the bests
635            // Python: index.py:1117-1118
636            all_matches.extend(weak_matches);
637            all_matches
638        } else {
639            merged_matches
640        };
641
642        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
643        let refined = refine_matches(&self.index, refined_matches, &query);
644
645        let mut sorted = refined;
646        sort_matches_by_line(&mut sorted);
647
648        let groups = group_matches_by_region(&sorted);
649
650        let detections: Vec<LicenseDetection> = groups
651            .iter()
652            .map(|group| {
653                let mut detection = empty_detection();
654                populate_detection_from_group_with_spdx(&mut detection, group, &self.spdx_mapping);
655                detection
656            })
657            .collect();
658
659        let detections = post_process_detections(detections, min_score);
660
661        Ok(detections)
662    }
663
664    pub fn detect_with_kind_and_source(
665        &self,
666        text: &str,
667        unknown_licenses: bool,
668        binary_derived: bool,
669        source_path: &str,
670    ) -> Result<Vec<LicenseDetection>> {
671        let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
672        attach_source_path_to_detections(&mut detections, source_path);
673        Ok(detections)
674    }
675
676    pub fn detect_with_kind_and_source_with_score(
677        &self,
678        text: &str,
679        unknown_licenses: bool,
680        binary_derived: bool,
681        source_path: &str,
682        min_score: f32,
683    ) -> Result<Vec<LicenseDetection>> {
684        let mut detections =
685            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
686        attach_source_path_to_detections(&mut detections, source_path);
687        Ok(detections)
688    }
689
690    /// Detect licenses and return raw matches (like Python's idx.match()).
691    ///
692    /// This method is only used by unit/golden tests for parity checks.
693    #[cfg(test)]
694    pub fn detect_matches_with_kind(
695        &self,
696        text: &str,
697        unknown_licenses: bool,
698        binary_derived: bool,
699    ) -> Result<Vec<LicenseMatch>> {
700        let clean_text = strip_utf8_bom_str(text);
701
702        let content = truncate_detection_text(clean_text);
703
704        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
705        let whole_query_run = query.whole_query_run();
706
707        let mut all_matches = Vec::new();
708        let mut candidate_contained_matches = Vec::new();
709        let mut aho_extra_matchables = BitSet::new();
710        let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
711
712        // Phase 1a: Hash matching
713        {
714            let hash_matches = hash_match(&self.index, &whole_query_run);
715
716            if !hash_matches.is_empty() {
717                let mut matches = hash_matches;
718                sort_matches_by_line(&mut matches);
719                return Ok(matches);
720            }
721        }
722
723        // Phase 1b: SPDX-LID matching
724        {
725            let spdx_matches = spdx_lid_match(&self.index, &query);
726            let merged_spdx = merge_overlapping_matches(&spdx_matches);
727            subtract_spdx_match_qspans(
728                &mut query,
729                &mut matched_qspans,
730                &mut aho_extra_matchables,
731                &merged_spdx,
732            );
733            all_matches.extend(merged_spdx);
734        }
735
736        // Phase 1c: Aho-Corasick matching
737        {
738            let aho_matches = if aho_extra_matchables.is_empty() {
739                aho_match(&self.index, &whole_query_run)
740            } else {
741                aho_match::aho_match_with_extra_matchables(
742                    &self.index,
743                    &whole_query_run,
744                    Some(&aho_extra_matchables),
745                )
746            };
747            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
748            candidate_contained_matches.extend(refined_aho.clone());
749            let (merged_aho, _) = merge_and_prepare_aho_matches(
750                &self.index,
751                &mut query,
752                &mut matched_qspans,
753                &refined_aho,
754            );
755            all_matches.extend(merged_aho);
756
757            let whole_query_followup = collect_whole_query_exact_followup_matches(
758                &self.index,
759                &mut query,
760                &mut matched_qspans,
761                &whole_query_run,
762            );
763            all_matches.extend(whole_query_followup);
764
765            let merged_seq = collect_regular_seq_matches(
766                &self.index,
767                &query,
768                &matched_qspans,
769                &candidate_contained_matches,
770            );
771            all_matches.extend(merged_seq);
772        }
773
774        // Step 1: Initial refine WITHOUT false positive filtering
775        let merged_matches =
776            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
777
778        // Step 2: Unknown detection and weak match handling
779        let refined_matches = if unknown_licenses {
780            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
781            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
782            let filtered_unknown =
783                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
784
785            let mut all_matches = good_matches;
786            all_matches.extend(filtered_unknown);
787            all_matches.extend(weak_matches);
788            all_matches
789        } else {
790            merged_matches
791        };
792
793        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
794        let refined = refine_matches(&self.index, refined_matches, &query);
795
796        let mut sorted = refined;
797        sort_matches_by_line(&mut sorted);
798
799        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
800        Ok(sorted)
801    }
802
803    /// Get a reference to the license index.
804    pub fn index(&self) -> &index::LicenseIndex {
805        &self.index
806    }
807
808    /// Get a reference to the SPDX mapping.
809    #[cfg(test)]
810    pub fn spdx_mapping(&self) -> &SpdxMapping {
811        &self.spdx_mapping
812    }
813}
814
815#[cfg(test)]
816mod tests;