Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4mod detection;
5pub mod embedded;
6
7#[cfg(test)]
8mod embedded_test;
9pub mod expression;
10#[cfg(test)]
11mod golden_test;
12pub mod hash_match;
13pub mod index;
14mod match_refine;
15pub mod models;
16pub mod query;
17pub mod rules;
18pub mod seq_match;
19pub mod spans;
20pub mod spdx_lid;
21pub mod spdx_mapping;
22#[cfg(test)]
23mod test_utils;
24pub mod tokenize;
25pub mod unknown_match;
26
27use bit_set::BitSet;
28use std::collections::HashSet;
29use std::path::Path;
30use std::sync::Arc;
31
32use anyhow::Result;
33
34use crate::license_detection::index::build_index_from_loaded;
35use crate::license_detection::query::Query;
36use crate::license_detection::rules::{
37    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
38};
39use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
40use crate::utils::text::strip_utf8_bom_str;
41
42use crate::license_detection::detection::populate_detection_from_group_with_spdx;
43use crate::license_detection::models::MatcherKind;
44
/// Location of the `rules` directory inside the reference scancode-toolkit submodule.
///
/// Referenced by test code and by the xtask generate-license-loader-artifact binary,
/// which is why the unused-item lint is silenced here.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Location of the `licenses` directory inside the reference scancode-toolkit submodule.
///
/// Referenced by test code and by the xtask generate-license-loader-artifact binary,
/// which is why the unused-item lint is silenced here.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Location of the license `data` directory (the parent of `rules` and `licenses`)
/// inside the reference scancode-toolkit submodule.
///
/// Referenced by test code and by the xtask generate-license-loader-artifact binary,
/// which is why the unused-item lint is silenced here.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
61
62pub use detection::{
63    LicenseDetection, create_detection_from_group, group_matches_by_region,
64    post_process_detections, sort_matches_by_line,
65};
66pub use models::LicenseMatch;
67
68pub use aho_match::aho_match;
69pub use hash_match::hash_match;
70pub use match_refine::{
71    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
72    refine_matches_without_false_positive_filter, split_weak_matches,
73};
74pub use seq_match::{
75    MAX_NEAR_DUPE_CANDIDATES, compute_candidates_with_msets, seq_match_with_candidates,
76};
77pub use spdx_lid::spdx_lid_match;
78pub use unknown_match::unknown_match;
79
80/// License detection engine that orchestrates the detection pipeline.
81///
82/// The engine loads license rules and builds an index for efficient matching.
83/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
84/// and combines their results into final license detections.
85#[derive(Debug, Clone)]
86pub struct LicenseDetectionEngine {
87    index: Arc<index::LicenseIndex>,
88    spdx_mapping: SpdxMapping,
89}
90
/// Maximum number of bytes of input text handed to the detection pipeline (10 MiB);
/// longer input is truncated before a query is built.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to `compute_candidates_with_msets` for regular
/// (per query run) sequence matching.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Base allowance, in query positions, for positions a sequence container covers
/// that its exact child matches do not.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance, in query positions, for positions covered by exact child
/// matches but not by the sequence container.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
95
96fn query_span_for_match(m: &LicenseMatch) -> Option<query::PositionSpan> {
97    (m.end_token > m.start_token).then(|| query::PositionSpan::new(m.start_token, m.end_token - 1))
98}
99
100fn has_full_match_coverage(m: &LicenseMatch) -> bool {
101    ((m.match_coverage * 100.0).round() / 100.0) == 100.0
102}
103
104fn is_redundant_same_expression_seq_container(
105    container: &LicenseMatch,
106    candidate_contained_matches: &[LicenseMatch],
107) -> bool {
108    let container_is_redundant_coverage =
109        has_full_match_coverage(container) || container.match_coverage >= 99.0;
110    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
111        return false;
112    }
113
114    let container_qspan_set: BitSet = container.qspan_bitset();
115
116    let mut contained: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
117        .iter()
118        .filter_map(|m| {
119            if m.matcher == MatcherKind::Aho
120                && has_full_match_coverage(m)
121                && m.license_expression == container.license_expression
122                && (container.qcontains_with_set(m, &container_qspan_set)
123                    || container.qoverlap_with_set(m, &container_qspan_set) > 0)
124            {
125                Some((m, m.qspan()))
126            } else {
127                None
128            }
129        })
130        .collect();
131
132    if contained.len() < 2 {
133        return false;
134    }
135
136    let material_children = contained
137        .iter()
138        .filter(|(m, _)| m.matched_length > 1)
139        .count();
140    if material_children < 2 {
141        return false;
142    }
143
144    contained.sort_by_key(|(m, _)| m.qspan_bounds());
145
146    let mut child_union = BitSet::new();
147    for (_, qspan) in &contained {
148        for &pos in qspan {
149            child_union.insert(pos);
150        }
151    }
152
153    let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
154    let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
155
156    let mut bridge_positions = BitSet::new();
157    for pair in contained.windows(2) {
158        let (_, previous_end) = pair[0].0.qspan_bounds();
159        let (next_start, _) = pair[1].0.qspan_bounds();
160
161        if next_start < previous_end {
162            return false;
163        }
164
165        for pos in previous_end..next_start {
166            bridge_positions.insert(pos);
167        }
168    }
169
170    let container_only_boundary_positions = container_only_positions
171        .difference(&bridge_positions)
172        .count();
173
174    if container_only_positions.count() == 1
175        && container_only_boundary_positions == 0
176        && child_only_positions.is_empty()
177    {
178        return false;
179    }
180
181    if child_only_positions.is_empty()
182        && container_only_positions.count() == container_only_boundary_positions
183        && container_only_boundary_positions <= 3
184    {
185        let earliest_child = contained
186            .iter()
187            .map(|(m, _)| m.qspan_bounds().0)
188            .min()
189            .unwrap_or(usize::MAX);
190        let latest_child = contained
191            .iter()
192            .map(|(m, _)| m.qspan_bounds().1.saturating_sub(1))
193            .max()
194            .unwrap_or(0);
195
196        let is_one_sided_boundary = container_only_positions
197            .iter()
198            .all(|pos| pos < earliest_child)
199            || container_only_positions
200                .iter()
201                .all(|pos| pos > latest_child);
202
203        if is_one_sided_boundary {
204            return false;
205        }
206    }
207
208    let max_container_only_positions =
209        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
210    let max_container_boundary_positions =
211        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
212    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
213
214    container_only_positions.count() <= max_container_only_positions
215        && container_only_boundary_positions <= max_container_boundary_positions
216        && child_only_positions.count() <= max_child_only_positions
217}
218
219fn filter_redundant_same_expression_seq_containers(
220    seq_matches: Vec<LicenseMatch>,
221    candidate_contained_matches: &[LicenseMatch],
222) -> Vec<LicenseMatch> {
223    seq_matches
224        .into_iter()
225        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
226        .collect()
227}
228
229fn is_redundant_low_coverage_composite_seq_wrapper(
230    container: &LicenseMatch,
231    candidate_contained_matches: &[LicenseMatch],
232) -> bool {
233    if container.matcher != seq_match::MATCH_SEQ || container.match_coverage >= 30.0 {
234        return false;
235    }
236
237    let container_qspan_set: BitSet = container.qspan_bitset();
238
239    let children: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
240        .iter()
241        .filter_map(|m| {
242            if m.matcher == aho_match::MATCH_AHO
243                && has_full_match_coverage(m)
244                && m.license_expression != container.license_expression
245                && (container.qcontains_with_set(m, &container_qspan_set)
246                    || container.qoverlap_with_set(m, &container_qspan_set) > 0)
247            {
248                Some((m, m.qspan()))
249            } else {
250                None
251            }
252        })
253        .collect();
254
255    if children.len() < 2 {
256        return false;
257    }
258
259    let unique_expressions: HashSet<&str> = children
260        .iter()
261        .map(|(m, _)| m.license_expression.as_str())
262        .collect();
263    if unique_expressions.len() < 2 {
264        return false;
265    }
266
267    let mut child_union = BitSet::new();
268    for (_, qspan) in &children {
269        for &pos in qspan {
270            child_union.insert(pos);
271        }
272    }
273
274    let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
275    let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
276
277    let mut sorted_children = children;
278    sorted_children.sort_by_key(|(m, _)| m.qspan_bounds());
279
280    let mut bridge_positions = BitSet::new();
281    for pair in sorted_children.windows(2) {
282        let (_, previous_end) = pair[0].0.qspan_bounds();
283        let (next_start, _) = pair[1].0.qspan_bounds();
284        for pos in previous_end..next_start {
285            bridge_positions.insert(pos);
286        }
287    }
288
289    let container_only_boundary_positions = container_only_positions
290        .difference(&bridge_positions)
291        .count();
292
293    child_only_positions.is_empty()
294        && container_only_positions.count() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
295        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
296}
297
298fn filter_redundant_low_coverage_composite_seq_wrappers(
299    seq_matches: Vec<LicenseMatch>,
300    candidate_contained_matches: &[LicenseMatch],
301) -> Vec<LicenseMatch> {
302    seq_matches
303        .into_iter()
304        .filter(|m| {
305            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
306        })
307        .collect()
308}
309
310fn subtract_spdx_match_qspans(
311    query: &mut Query<'_>,
312    matched_qspans: &mut Vec<query::PositionSpan>,
313    aho_extra_matchables: &mut BitSet,
314    spdx_matches: &[LicenseMatch],
315) {
316    for m in spdx_matches {
317        let Some(span) = query_span_for_match(m) else {
318            continue;
319        };
320
321        for pos in span.iter() {
322            aho_extra_matchables.insert(pos);
323        }
324        query.subtract(&span);
325
326        if (m.match_coverage * 100.0).round() / 100.0 == 100.0 {
327            matched_qspans.push(span);
328        }
329    }
330}
331
332fn merge_and_prepare_aho_matches(
333    index: &index::LicenseIndex,
334    query: &mut Query<'_>,
335    matched_qspans: &mut Vec<query::PositionSpan>,
336    refined_aho: &[LicenseMatch],
337) -> (Vec<LicenseMatch>, bool) {
338    let merged_aho = merge_overlapping_matches(refined_aho);
339    let mut saw_long_exact_license_text_match = false;
340
341    for m in &merged_aho {
342        let Some(span) = query_span_for_match(m) else {
343            continue;
344        };
345
346        if has_full_match_coverage(m) {
347            matched_qspans.push(span.clone());
348        }
349
350        if index
351            .rules_by_rid
352            .get(m.rid)
353            .is_some_and(|rule| rule.is_license_text())
354            && m.rule_length > 120
355            && m.match_coverage > 98.0
356        {
357            query.subtract(&span);
358            saw_long_exact_license_text_match = true;
359        }
360    }
361
362    (merged_aho, saw_long_exact_license_text_match)
363}
364
365fn collect_whole_query_exact_followup_matches(
366    index: &index::LicenseIndex,
367    query: &mut Query<'_>,
368    matched_qspans: &mut Vec<query::PositionSpan>,
369    whole_run: &query::QueryRun<'_>,
370) -> Vec<LicenseMatch> {
371    let mut seq_all_matches = Vec::new();
372
373    if whole_run.is_matchable(false, matched_qspans) {
374        let near_dupe_candidates =
375            compute_candidates_with_msets(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
376
377        if !near_dupe_candidates.is_empty() {
378            let near_dupe_matches =
379                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
380
381            for m in &near_dupe_matches {
382                if m.end_token > m.start_token {
383                    let span = query::PositionSpan::new(m.start_token, m.end_token - 1);
384                    query.subtract(&span);
385                    matched_qspans.push(span);
386                }
387            }
388
389            seq_all_matches.extend(near_dupe_matches);
390        }
391    }
392
393    seq_all_matches
394}
395
396fn collect_regular_seq_matches(
397    index: &index::LicenseIndex,
398    query: &Query<'_>,
399    matched_qspans: &[query::PositionSpan],
400    candidate_contained_matches: &[LicenseMatch],
401) -> Vec<LicenseMatch> {
402    let mut seq_all_matches = Vec::new();
403
404    for query_run in query.query_runs() {
405        if !query_run.is_matchable(false, matched_qspans) {
406            continue;
407        }
408
409        let candidates =
410            compute_candidates_with_msets(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
411        if !candidates.is_empty() {
412            let matches = seq_match_with_candidates(index, &query_run, &candidates);
413            seq_all_matches.extend(matches);
414        }
415    }
416
417    let merged_seq = merge_overlapping_matches(&seq_all_matches);
418    let filtered_same_expression =
419        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
420    filter_redundant_low_coverage_composite_seq_wrappers(
421        filtered_same_expression,
422        candidate_contained_matches,
423    )
424}
425
426impl LicenseDetectionEngine {
427    /// Create a new license detection engine from a pre-built license index.
428    ///
429    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
430    /// It builds the SPDX mapping from the licenses in the index.
431    fn from_index(index: index::LicenseIndex) -> Result<Self> {
432        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
433        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
434        let spdx_mapping = build_spdx_mapping(&license_vec);
435
436        Ok(Self {
437            index: Arc::new(index),
438            spdx_mapping,
439        })
440    }
441
442    /// Create a new license detection engine from the embedded license index.
443    ///
444    /// This method loads the build-time embedded license artifact and constructs
445    /// the runtime license index. This eliminates the runtime dependency on the
446    /// ScanCode rules directory.
447    ///
448    /// # Returns
449    /// A Result containing the engine or an error
450    pub fn from_embedded() -> Result<Self> {
451        let artifact_bytes =
452            include_bytes!("../../resources/license_detection/license_index_loader.msgpack.zst");
453        let decompressed = zstd::decode_all(&artifact_bytes[..])
454            .map_err(|e| anyhow::anyhow!("Failed to decompress embedded artifact: {}", e))?;
455        let snapshot: embedded::schema::EmbeddedLoaderSnapshot =
456            rmp_serde::from_slice(&decompressed)
457                .map_err(|e| anyhow::anyhow!("Failed to deserialize embedded artifact: {}", e))?;
458
459        if snapshot.schema_version != embedded::schema::SCHEMA_VERSION {
460            anyhow::bail!(
461                "Embedded artifact schema version mismatch: expected {}, got {}",
462                embedded::schema::SCHEMA_VERSION,
463                snapshot.schema_version
464            );
465        }
466
467        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
468        Self::from_index(index)
469    }
470
471    /// Create a new license detection engine from a directory of license rules.
472    ///
473    /// # Arguments
474    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
475    ///
476    /// # Returns
477    /// A Result containing the engine or an error
478    pub fn from_directory(rules_path: &Path) -> Result<Self> {
479        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
480            (rules_path.join("rules"), rules_path.join("licenses"))
481        } else if rules_path.ends_with("rules") {
482            let parent = rules_path.parent().ok_or_else(|| {
483                anyhow::anyhow!("Cannot determine parent directory for rules path")
484            })?;
485            (rules_path.to_path_buf(), parent.join("licenses"))
486        } else {
487            (rules_path.to_path_buf(), rules_path.to_path_buf())
488        };
489
490        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
491        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
492        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
493
494        Self::from_index(index)
495    }
496
497    pub fn detect_with_kind(
498        &self,
499        text: &str,
500        unknown_licenses: bool,
501        binary_derived: bool,
502    ) -> Result<Vec<LicenseDetection>> {
503        let clean_text = strip_utf8_bom_str(text);
504
505        let content = if clean_text.len() > MAX_DETECTION_SIZE {
506            log::warn!(
507                "Content size {} exceeds limit {}, truncating for detection",
508                clean_text.len(),
509                MAX_DETECTION_SIZE
510            );
511            &clean_text[..MAX_DETECTION_SIZE]
512        } else {
513            clean_text
514        };
515
516        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
517        let whole_query_run = query.whole_query_run();
518
519        let mut all_matches = Vec::new();
520        let mut candidate_contained_matches = Vec::new();
521        let mut aho_extra_matchables = BitSet::new();
522        let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
523
524        // Phase 1a: Hash matching
525        // Python returns immediately if hash matches found (index.py:987-991)
526        {
527            let hash_matches = hash_match(&self.index, &whole_query_run);
528
529            if !hash_matches.is_empty() {
530                let mut matches = hash_matches;
531                sort_matches_by_line(&mut matches);
532
533                let groups = group_matches_by_region(&matches);
534                let detections: Vec<LicenseDetection> = groups
535                    .iter()
536                    .map(|group| {
537                        let mut detection = create_detection_from_group(group);
538                        populate_detection_from_group_with_spdx(
539                            &mut detection,
540                            group,
541                            &self.spdx_mapping,
542                        );
543                        detection
544                    })
545                    .collect();
546
547                return Ok(post_process_detections(detections, 0.0));
548            }
549        }
550
551        // Phase 1b: SPDX-LID matching
552        {
553            let spdx_matches = spdx_lid_match(&self.index, &query);
554            let merged_spdx = merge_overlapping_matches(&spdx_matches);
555            subtract_spdx_match_qspans(
556                &mut query,
557                &mut matched_qspans,
558                &mut aho_extra_matchables,
559                &merged_spdx,
560            );
561            all_matches.extend(merged_spdx);
562        }
563
564        // Phase 1c: Aho-Corasick matching
565        {
566            let aho_matches = if aho_extra_matchables.is_empty() {
567                aho_match(&self.index, &whole_query_run)
568            } else {
569                aho_match::aho_match_with_extra_matchables(
570                    &self.index,
571                    &whole_query_run,
572                    Some(&aho_extra_matchables),
573                )
574            };
575
576            // Python's get_exact_matches() calls refine_matches with merge=False
577            // This applies quality filters including required phrase filtering
578            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
579            candidate_contained_matches.extend(refined_aho.clone());
580            let (merged_aho, _) = merge_and_prepare_aho_matches(
581                &self.index,
582                &mut query,
583                &mut matched_qspans,
584                &refined_aho,
585            );
586            all_matches.extend(merged_aho);
587
588            let whole_query_followup = collect_whole_query_exact_followup_matches(
589                &self.index,
590                &mut query,
591                &mut matched_qspans,
592                &whole_query_run,
593            );
594            all_matches.extend(whole_query_followup);
595
596            let merged_seq = collect_regular_seq_matches(
597                &self.index,
598                &query,
599                &matched_qspans,
600                &candidate_contained_matches,
601            );
602            all_matches.extend(merged_seq);
603        }
604
605        // Step 1: Initial refine WITHOUT false positive filtering
606        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
607        let merged_matches =
608            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
609
610        // Step 2: Unknown detection and weak match handling
611        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
612        let refined_matches = if unknown_licenses {
613            // Split weak from good - Python: index.py:1083
614            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
615
616            // Unknown detection on uncovered regions - Python: index.py:1093-1114
617            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
618            let filtered_unknown =
619                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
620
621            let mut all_matches = good_matches;
622            all_matches.extend(filtered_unknown);
623            // reinject weak matches and let refine matches keep the bests
624            // Python: index.py:1117-1118
625            all_matches.extend(weak_matches);
626            all_matches
627        } else {
628            merged_matches
629        };
630
631        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
632        let refined = refine_matches(&self.index, refined_matches, &query);
633
634        let mut sorted = refined;
635        sort_matches_by_line(&mut sorted);
636
637        let groups = group_matches_by_region(&sorted);
638
639        let detections: Vec<LicenseDetection> = groups
640            .iter()
641            .map(|group| {
642                let mut detection = create_detection_from_group(group);
643                populate_detection_from_group_with_spdx(&mut detection, group, &self.spdx_mapping);
644                detection
645            })
646            .collect();
647
648        let detections = post_process_detections(detections, 0.0);
649
650        Ok(detections)
651    }
652
653    /// Detect licenses and return raw matches (like Python's idx.match()).
654    ///
655    /// This method is only used by unit/golden tests for parity checks.
656    #[cfg(test)]
657    pub fn detect_matches_with_kind(
658        &self,
659        text: &str,
660        unknown_licenses: bool,
661        binary_derived: bool,
662    ) -> Result<Vec<LicenseMatch>> {
663        let clean_text = strip_utf8_bom_str(text);
664
665        let content = if clean_text.len() > MAX_DETECTION_SIZE {
666            log::warn!(
667                "Content size {} exceeds limit {}, truncating for detection",
668                clean_text.len(),
669                MAX_DETECTION_SIZE
670            );
671            &clean_text[..MAX_DETECTION_SIZE]
672        } else {
673            clean_text
674        };
675
676        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
677        let whole_query_run = query.whole_query_run();
678
679        let mut all_matches = Vec::new();
680        let mut candidate_contained_matches = Vec::new();
681        let mut aho_extra_matchables = BitSet::new();
682        let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
683
684        // Phase 1a: Hash matching
685        {
686            let hash_matches = hash_match(&self.index, &whole_query_run);
687
688            if !hash_matches.is_empty() {
689                let mut matches = hash_matches;
690                sort_matches_by_line(&mut matches);
691                return Ok(matches);
692            }
693        }
694
695        // Phase 1b: SPDX-LID matching
696        {
697            let spdx_matches = spdx_lid_match(&self.index, &query);
698            let merged_spdx = merge_overlapping_matches(&spdx_matches);
699            subtract_spdx_match_qspans(
700                &mut query,
701                &mut matched_qspans,
702                &mut aho_extra_matchables,
703                &merged_spdx,
704            );
705            all_matches.extend(merged_spdx);
706        }
707
708        // Phase 1c: Aho-Corasick matching
709        {
710            let aho_matches = if aho_extra_matchables.is_empty() {
711                aho_match(&self.index, &whole_query_run)
712            } else {
713                aho_match::aho_match_with_extra_matchables(
714                    &self.index,
715                    &whole_query_run,
716                    Some(&aho_extra_matchables),
717                )
718            };
719            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
720            candidate_contained_matches.extend(refined_aho.clone());
721            let (merged_aho, _) = merge_and_prepare_aho_matches(
722                &self.index,
723                &mut query,
724                &mut matched_qspans,
725                &refined_aho,
726            );
727            all_matches.extend(merged_aho);
728
729            let whole_query_followup = collect_whole_query_exact_followup_matches(
730                &self.index,
731                &mut query,
732                &mut matched_qspans,
733                &whole_query_run,
734            );
735            all_matches.extend(whole_query_followup);
736
737            let merged_seq = collect_regular_seq_matches(
738                &self.index,
739                &query,
740                &matched_qspans,
741                &candidate_contained_matches,
742            );
743            all_matches.extend(merged_seq);
744        }
745
746        // Step 1: Initial refine WITHOUT false positive filtering
747        let merged_matches =
748            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
749
750        // Step 2: Unknown detection and weak match handling
751        let refined_matches = if unknown_licenses {
752            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
753            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
754            let filtered_unknown =
755                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
756
757            let mut all_matches = good_matches;
758            all_matches.extend(filtered_unknown);
759            all_matches.extend(weak_matches);
760            all_matches
761        } else {
762            merged_matches
763        };
764
765        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
766        let refined = refine_matches(&self.index, refined_matches, &query);
767
768        let mut sorted = refined;
769        sort_matches_by_line(&mut sorted);
770
771        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
772        Ok(sorted)
773    }
774
775    /// Get a reference to the license index.
776    pub fn index(&self) -> &index::LicenseIndex {
777        &self.index
778    }
779
780    /// Get a reference to the SPDX mapping.
781    #[cfg(test)]
782    pub fn spdx_mapping(&self) -> &SpdxMapping {
783        &self.spdx_mapping
784    }
785}
786
787#[cfg(test)]
788mod tests;