Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7
8#[cfg(test)]
9mod embedded_test;
10pub mod expression;
11#[cfg(test)]
12mod golden_test;
13pub mod hash_match;
14pub mod index;
15mod match_refine;
16pub mod models;
17pub mod query;
18pub mod rules;
19pub mod seq_match;
20pub mod spans;
21pub mod spdx_lid;
22pub mod spdx_mapping;
23#[cfg(test)]
24mod test_utils;
25pub mod tokenize;
26pub mod unknown_match;
27
28use bit_set::BitSet;
29use std::collections::HashSet;
30use std::path::Path;
31use std::sync::Arc;
32
33use anyhow::Result;
34
35use crate::license_detection::embedded::index::load_license_index_from_bytes;
36use crate::license_detection::index::build_index_from_loaded;
37use crate::license_detection::query::Query;
38use crate::license_detection::rules::{
39    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
40};
41use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
42use crate::utils::text::strip_utf8_bom_str;
43
44use crate::license_detection::detection::{
45    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
46};
47use crate::license_detection::models::MatcherKind;
48
49/// Path to the license rules directory in the reference scancode-toolkit submodule.
50/// Used by test code and the xtask generate-license-loader-artifact binary.
51#[allow(dead_code)]
52pub const SCANCODE_LICENSES_RULES_PATH: &str =
53    "reference/scancode-toolkit/src/licensedcode/data/rules";
54
55/// Path to the licenses directory in the reference scancode-toolkit submodule.
56/// Used by test code and the xtask generate-license-loader-artifact binary.
57#[allow(dead_code)]
58pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
59    "reference/scancode-toolkit/src/licensedcode/data/licenses";
60
61/// Path to the license data directory in the reference scancode-toolkit submodule.
62/// Used by test code and the xtask generate-license-loader-artifact binary.
63#[allow(dead_code)]
64pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
65
66pub(crate) use detection::{
67    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
68};
69pub use models::LicenseMatch;
70
71pub use aho_match::aho_match;
72pub use hash_match::hash_match;
73pub use match_refine::{
74    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
75    refine_matches_without_false_positive_filter, split_weak_matches,
76};
77pub use seq_match::{
78    MAX_NEAR_DUPE_CANDIDATES, compute_candidates_with_msets, seq_match_with_candidates,
79};
80pub use spdx_lid::spdx_lid_match;
81pub use unknown_match::unknown_match;
82
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// License index consulted by every matcher. It is reference-counted, so
    /// cloning the engine shares the index instead of copying it.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is
    /// constructed; used while populating detections.
    spdx_mapping: SpdxMapping,
}
93
/// Upper bound, in bytes, on the text handed to detection; longer input is
/// truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
/// Candidate limit passed to `compute_candidates_with_msets` when sequence
/// matching individual query runs in `collect_regular_seq_matches`.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Base allowance of query positions that are covered only by a sequence
/// container (and not by its exact-match children) in the redundancy checks.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance of child positions lying outside the container's query span
/// in the same-expression redundancy check (the check permits this value plus one).
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
98
99fn truncate_detection_text(clean_text: &str) -> &str {
100    if clean_text.len() <= MAX_DETECTION_SIZE {
101        return clean_text;
102    }
103
104    log::warn!(
105        "Content size {} exceeds limit {}, truncating for detection",
106        clean_text.len(),
107        MAX_DETECTION_SIZE
108    );
109
110    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
111    &clean_text[..boundary]
112}
113
114fn query_span_for_match(m: &LicenseMatch) -> Option<query::PositionSpan> {
115    (m.end_token > m.start_token).then(|| query::PositionSpan::new(m.start_token, m.end_token - 1))
116}
117
118fn has_full_match_coverage(m: &LicenseMatch) -> bool {
119    ((m.match_coverage * 100.0).round() / 100.0) == 100.0
120}
121
122fn is_redundant_same_expression_seq_container(
123    container: &LicenseMatch,
124    candidate_contained_matches: &[LicenseMatch],
125) -> bool {
126    let container_is_redundant_coverage =
127        has_full_match_coverage(container) || container.match_coverage >= 99.0;
128    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
129        return false;
130    }
131
132    let container_qspan_set: BitSet = container.qspan_bitset();
133
134    let mut contained: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
135        .iter()
136        .filter_map(|m| {
137            if m.matcher == MatcherKind::Aho
138                && has_full_match_coverage(m)
139                && m.license_expression == container.license_expression
140                && (container.qcontains_with_set(m, &container_qspan_set)
141                    || container.qoverlap_with_set(m, &container_qspan_set) > 0)
142            {
143                Some((m, m.qspan()))
144            } else {
145                None
146            }
147        })
148        .collect();
149
150    if contained.len() < 2 {
151        return false;
152    }
153
154    let material_children = contained
155        .iter()
156        .filter(|(m, _)| m.matched_length > 1)
157        .count();
158    if material_children < 2 {
159        return false;
160    }
161
162    contained.sort_by_key(|(m, _)| m.qspan_bounds());
163
164    let mut child_union = BitSet::new();
165    for (_, qspan) in &contained {
166        for &pos in qspan {
167            child_union.insert(pos);
168        }
169    }
170
171    let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
172    let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
173
174    let mut bridge_positions = BitSet::new();
175    for pair in contained.windows(2) {
176        let (_, previous_end) = pair[0].0.qspan_bounds();
177        let (next_start, _) = pair[1].0.qspan_bounds();
178
179        if next_start < previous_end {
180            return false;
181        }
182
183        for pos in previous_end..next_start {
184            bridge_positions.insert(pos);
185        }
186    }
187
188    let container_only_boundary_positions = container_only_positions
189        .difference(&bridge_positions)
190        .count();
191
192    if container_only_positions.count() == 1
193        && container_only_boundary_positions == 0
194        && child_only_positions.is_empty()
195    {
196        return false;
197    }
198
199    if child_only_positions.is_empty()
200        && container_only_positions.count() == container_only_boundary_positions
201        && container_only_boundary_positions <= 3
202    {
203        let earliest_child = contained
204            .iter()
205            .map(|(m, _)| m.qspan_bounds().0)
206            .min()
207            .unwrap_or(usize::MAX);
208        let latest_child = contained
209            .iter()
210            .map(|(m, _)| m.qspan_bounds().1.saturating_sub(1))
211            .max()
212            .unwrap_or(0);
213
214        let is_one_sided_boundary = container_only_positions
215            .iter()
216            .all(|pos| pos < earliest_child)
217            || container_only_positions
218                .iter()
219                .all(|pos| pos > latest_child);
220
221        if is_one_sided_boundary {
222            return false;
223        }
224    }
225
226    let max_container_only_positions =
227        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
228    let max_container_boundary_positions =
229        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
230    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
231
232    container_only_positions.count() <= max_container_only_positions
233        && container_only_boundary_positions <= max_container_boundary_positions
234        && child_only_positions.count() <= max_child_only_positions
235}
236
237fn filter_redundant_same_expression_seq_containers(
238    seq_matches: Vec<LicenseMatch>,
239    candidate_contained_matches: &[LicenseMatch],
240) -> Vec<LicenseMatch> {
241    seq_matches
242        .into_iter()
243        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
244        .collect()
245}
246
247fn is_redundant_low_coverage_composite_seq_wrapper(
248    container: &LicenseMatch,
249    candidate_contained_matches: &[LicenseMatch],
250) -> bool {
251    if container.matcher != seq_match::MATCH_SEQ || container.match_coverage >= 30.0 {
252        return false;
253    }
254
255    let container_qspan_set: BitSet = container.qspan_bitset();
256
257    let children: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
258        .iter()
259        .filter_map(|m| {
260            if m.matcher == aho_match::MATCH_AHO
261                && has_full_match_coverage(m)
262                && m.license_expression != container.license_expression
263                && (container.qcontains_with_set(m, &container_qspan_set)
264                    || container.qoverlap_with_set(m, &container_qspan_set) > 0)
265            {
266                Some((m, m.qspan()))
267            } else {
268                None
269            }
270        })
271        .collect();
272
273    if children.len() < 2 {
274        return false;
275    }
276
277    let unique_expressions: HashSet<&str> = children
278        .iter()
279        .map(|(m, _)| m.license_expression.as_str())
280        .collect();
281    if unique_expressions.len() < 2 {
282        return false;
283    }
284
285    let mut child_union = BitSet::new();
286    for (_, qspan) in &children {
287        for &pos in qspan {
288            child_union.insert(pos);
289        }
290    }
291
292    let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
293    let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
294
295    let mut sorted_children = children;
296    sorted_children.sort_by_key(|(m, _)| m.qspan_bounds());
297
298    let mut bridge_positions = BitSet::new();
299    for pair in sorted_children.windows(2) {
300        let (_, previous_end) = pair[0].0.qspan_bounds();
301        let (next_start, _) = pair[1].0.qspan_bounds();
302        for pos in previous_end..next_start {
303            bridge_positions.insert(pos);
304        }
305    }
306
307    let container_only_boundary_positions = container_only_positions
308        .difference(&bridge_positions)
309        .count();
310
311    child_only_positions.is_empty()
312        && container_only_positions.count() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
313        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
314}
315
316fn filter_redundant_low_coverage_composite_seq_wrappers(
317    seq_matches: Vec<LicenseMatch>,
318    candidate_contained_matches: &[LicenseMatch],
319) -> Vec<LicenseMatch> {
320    seq_matches
321        .into_iter()
322        .filter(|m| {
323            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
324        })
325        .collect()
326}
327
328fn subtract_spdx_match_qspans(
329    query: &mut Query<'_>,
330    matched_qspans: &mut Vec<query::PositionSpan>,
331    aho_extra_matchables: &mut BitSet,
332    spdx_matches: &[LicenseMatch],
333) {
334    for m in spdx_matches {
335        let Some(span) = query_span_for_match(m) else {
336            continue;
337        };
338
339        for pos in span.iter() {
340            aho_extra_matchables.insert(pos);
341        }
342        query.subtract(&span);
343
344        if (m.match_coverage * 100.0).round() / 100.0 == 100.0 {
345            matched_qspans.push(span);
346        }
347    }
348}
349
350fn merge_and_prepare_aho_matches(
351    index: &index::LicenseIndex,
352    query: &mut Query<'_>,
353    matched_qspans: &mut Vec<query::PositionSpan>,
354    refined_aho: &[LicenseMatch],
355) -> (Vec<LicenseMatch>, bool) {
356    let merged_aho = merge_overlapping_matches(refined_aho);
357    let mut saw_long_exact_license_text_match = false;
358
359    for m in &merged_aho {
360        let Some(span) = query_span_for_match(m) else {
361            continue;
362        };
363
364        if has_full_match_coverage(m) {
365            matched_qspans.push(span.clone());
366        }
367
368        if index
369            .rules_by_rid
370            .get(m.rid)
371            .is_some_and(|rule| rule.is_license_text())
372            && m.rule_length > 120
373            && m.match_coverage > 98.0
374        {
375            query.subtract(&span);
376            saw_long_exact_license_text_match = true;
377        }
378    }
379
380    (merged_aho, saw_long_exact_license_text_match)
381}
382
383fn collect_whole_query_exact_followup_matches(
384    index: &index::LicenseIndex,
385    query: &mut Query<'_>,
386    matched_qspans: &mut Vec<query::PositionSpan>,
387    whole_run: &query::QueryRun<'_>,
388) -> Vec<LicenseMatch> {
389    let mut seq_all_matches = Vec::new();
390
391    if whole_run.is_matchable(false, matched_qspans) {
392        let near_dupe_candidates =
393            compute_candidates_with_msets(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
394
395        if !near_dupe_candidates.is_empty() {
396            let near_dupe_matches =
397                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
398
399            for m in &near_dupe_matches {
400                if m.end_token > m.start_token {
401                    let span = query::PositionSpan::new(m.start_token, m.end_token - 1);
402                    query.subtract(&span);
403                    matched_qspans.push(span);
404                }
405            }
406
407            seq_all_matches.extend(near_dupe_matches);
408        }
409    }
410
411    seq_all_matches
412}
413
414fn collect_regular_seq_matches(
415    index: &index::LicenseIndex,
416    query: &Query<'_>,
417    matched_qspans: &[query::PositionSpan],
418    candidate_contained_matches: &[LicenseMatch],
419) -> Vec<LicenseMatch> {
420    let mut seq_all_matches = Vec::new();
421
422    for query_run in query.query_runs() {
423        if !query_run.is_matchable(false, matched_qspans) {
424            continue;
425        }
426
427        let candidates =
428            compute_candidates_with_msets(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
429        if !candidates.is_empty() {
430            let matches = seq_match_with_candidates(index, &query_run, &candidates);
431            seq_all_matches.extend(matches);
432        }
433    }
434
435    let merged_seq = merge_overlapping_matches(&seq_all_matches);
436    let filtered_same_expression =
437        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
438    filter_redundant_low_coverage_composite_seq_wrappers(
439        filtered_same_expression,
440        candidate_contained_matches,
441    )
442}
443
444impl LicenseDetectionEngine {
445    /// Create a new license detection engine from a pre-built license index.
446    ///
447    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
448    /// It builds the SPDX mapping from the licenses in the index.
449    fn from_index(index: index::LicenseIndex) -> Result<Self> {
450        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
451        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
452        let spdx_mapping = build_spdx_mapping(&license_vec);
453
454        Ok(Self {
455            index: Arc::new(index),
456            spdx_mapping,
457        })
458    }
459
460    /// Create a new license detection engine from the embedded license index.
461    ///
462    /// This method loads the build-time embedded license artifact and constructs
463    /// the runtime license index. This eliminates the runtime dependency on the
464    /// ScanCode rules directory.
465    ///
466    /// # Returns
467    /// A Result containing the engine or an error
468    pub fn from_embedded() -> Result<Self> {
469        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
470        let index = load_license_index_from_bytes(artifact_bytes)
471            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
472        Self::from_index(index)
473    }
474
475    /// Create a new license detection engine from a directory of license rules.
476    ///
477    /// # Arguments
478    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
479    ///
480    /// # Returns
481    /// A Result containing the engine or an error
482    pub fn from_directory(rules_path: &Path) -> Result<Self> {
483        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
484            (rules_path.join("rules"), rules_path.join("licenses"))
485        } else if rules_path.ends_with("rules") {
486            let parent = rules_path.parent().ok_or_else(|| {
487                anyhow::anyhow!("Cannot determine parent directory for rules path")
488            })?;
489            (rules_path.to_path_buf(), parent.join("licenses"))
490        } else {
491            (rules_path.to_path_buf(), rules_path.to_path_buf())
492        };
493
494        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
495        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
496        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
497
498        Self::from_index(index)
499    }
500
501    pub fn detect_with_kind(
502        &self,
503        text: &str,
504        unknown_licenses: bool,
505        binary_derived: bool,
506    ) -> Result<Vec<LicenseDetection>> {
507        let clean_text = strip_utf8_bom_str(text);
508
509        let content = truncate_detection_text(clean_text);
510
511        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
512        let whole_query_run = query.whole_query_run();
513
514        let mut all_matches = Vec::new();
515        let mut candidate_contained_matches = Vec::new();
516        let mut aho_extra_matchables = BitSet::new();
517        let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
518
519        // Phase 1a: Hash matching
520        // Python returns immediately if hash matches found (index.py:987-991)
521        {
522            let hash_matches = hash_match(&self.index, &whole_query_run);
523
524            if !hash_matches.is_empty() {
525                let mut matches = hash_matches;
526                sort_matches_by_line(&mut matches);
527
528                let groups = group_matches_by_region(&matches);
529                let detections: Vec<LicenseDetection> = groups
530                    .iter()
531                    .map(|group| {
532                        let mut detection = empty_detection();
533                        populate_detection_from_group_with_spdx(
534                            &mut detection,
535                            group,
536                            &self.spdx_mapping,
537                        );
538                        detection
539                    })
540                    .collect();
541
542                return Ok(post_process_detections(detections, 0.0));
543            }
544        }
545
546        // Phase 1b: SPDX-LID matching
547        {
548            let spdx_matches = spdx_lid_match(&self.index, &query);
549            let merged_spdx = merge_overlapping_matches(&spdx_matches);
550            subtract_spdx_match_qspans(
551                &mut query,
552                &mut matched_qspans,
553                &mut aho_extra_matchables,
554                &merged_spdx,
555            );
556            all_matches.extend(merged_spdx);
557        }
558
559        // Phase 1c: Aho-Corasick matching
560        {
561            let aho_matches = if aho_extra_matchables.is_empty() {
562                aho_match(&self.index, &whole_query_run)
563            } else {
564                aho_match::aho_match_with_extra_matchables(
565                    &self.index,
566                    &whole_query_run,
567                    Some(&aho_extra_matchables),
568                )
569            };
570
571            // Python's get_exact_matches() calls refine_matches with merge=False
572            // This applies quality filters including required phrase filtering
573            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
574            candidate_contained_matches.extend(refined_aho.clone());
575            let (merged_aho, _) = merge_and_prepare_aho_matches(
576                &self.index,
577                &mut query,
578                &mut matched_qspans,
579                &refined_aho,
580            );
581            all_matches.extend(merged_aho);
582
583            let whole_query_followup = collect_whole_query_exact_followup_matches(
584                &self.index,
585                &mut query,
586                &mut matched_qspans,
587                &whole_query_run,
588            );
589            all_matches.extend(whole_query_followup);
590
591            let merged_seq = collect_regular_seq_matches(
592                &self.index,
593                &query,
594                &matched_qspans,
595                &candidate_contained_matches,
596            );
597            all_matches.extend(merged_seq);
598        }
599
600        // Step 1: Initial refine WITHOUT false positive filtering
601        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
602        let merged_matches =
603            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
604
605        // Step 2: Unknown detection and weak match handling
606        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
607        let refined_matches = if unknown_licenses {
608            // Split weak from good - Python: index.py:1083
609            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
610
611            // Unknown detection on uncovered regions - Python: index.py:1093-1114
612            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
613            let filtered_unknown =
614                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
615
616            let mut all_matches = good_matches;
617            all_matches.extend(filtered_unknown);
618            // reinject weak matches and let refine matches keep the bests
619            // Python: index.py:1117-1118
620            all_matches.extend(weak_matches);
621            all_matches
622        } else {
623            merged_matches
624        };
625
626        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
627        let refined = refine_matches(&self.index, refined_matches, &query);
628
629        let mut sorted = refined;
630        sort_matches_by_line(&mut sorted);
631
632        let groups = group_matches_by_region(&sorted);
633
634        let detections: Vec<LicenseDetection> = groups
635            .iter()
636            .map(|group| {
637                let mut detection = empty_detection();
638                populate_detection_from_group_with_spdx(&mut detection, group, &self.spdx_mapping);
639                detection
640            })
641            .collect();
642
643        let detections = post_process_detections(detections, 0.0);
644
645        Ok(detections)
646    }
647
648    pub fn detect_with_kind_and_source(
649        &self,
650        text: &str,
651        unknown_licenses: bool,
652        binary_derived: bool,
653        source_path: &str,
654    ) -> Result<Vec<LicenseDetection>> {
655        let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
656        attach_source_path_to_detections(&mut detections, source_path);
657        Ok(detections)
658    }
659
660    /// Detect licenses and return raw matches (like Python's idx.match()).
661    ///
662    /// This method is only used by unit/golden tests for parity checks.
663    #[cfg(test)]
664    pub fn detect_matches_with_kind(
665        &self,
666        text: &str,
667        unknown_licenses: bool,
668        binary_derived: bool,
669    ) -> Result<Vec<LicenseMatch>> {
670        let clean_text = strip_utf8_bom_str(text);
671
672        let content = truncate_detection_text(clean_text);
673
674        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
675        let whole_query_run = query.whole_query_run();
676
677        let mut all_matches = Vec::new();
678        let mut candidate_contained_matches = Vec::new();
679        let mut aho_extra_matchables = BitSet::new();
680        let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
681
682        // Phase 1a: Hash matching
683        {
684            let hash_matches = hash_match(&self.index, &whole_query_run);
685
686            if !hash_matches.is_empty() {
687                let mut matches = hash_matches;
688                sort_matches_by_line(&mut matches);
689                return Ok(matches);
690            }
691        }
692
693        // Phase 1b: SPDX-LID matching
694        {
695            let spdx_matches = spdx_lid_match(&self.index, &query);
696            let merged_spdx = merge_overlapping_matches(&spdx_matches);
697            subtract_spdx_match_qspans(
698                &mut query,
699                &mut matched_qspans,
700                &mut aho_extra_matchables,
701                &merged_spdx,
702            );
703            all_matches.extend(merged_spdx);
704        }
705
706        // Phase 1c: Aho-Corasick matching
707        {
708            let aho_matches = if aho_extra_matchables.is_empty() {
709                aho_match(&self.index, &whole_query_run)
710            } else {
711                aho_match::aho_match_with_extra_matchables(
712                    &self.index,
713                    &whole_query_run,
714                    Some(&aho_extra_matchables),
715                )
716            };
717            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
718            candidate_contained_matches.extend(refined_aho.clone());
719            let (merged_aho, _) = merge_and_prepare_aho_matches(
720                &self.index,
721                &mut query,
722                &mut matched_qspans,
723                &refined_aho,
724            );
725            all_matches.extend(merged_aho);
726
727            let whole_query_followup = collect_whole_query_exact_followup_matches(
728                &self.index,
729                &mut query,
730                &mut matched_qspans,
731                &whole_query_run,
732            );
733            all_matches.extend(whole_query_followup);
734
735            let merged_seq = collect_regular_seq_matches(
736                &self.index,
737                &query,
738                &matched_qspans,
739                &candidate_contained_matches,
740            );
741            all_matches.extend(merged_seq);
742        }
743
744        // Step 1: Initial refine WITHOUT false positive filtering
745        let merged_matches =
746            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
747
748        // Step 2: Unknown detection and weak match handling
749        let refined_matches = if unknown_licenses {
750            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
751            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
752            let filtered_unknown =
753                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
754
755            let mut all_matches = good_matches;
756            all_matches.extend(filtered_unknown);
757            all_matches.extend(weak_matches);
758            all_matches
759        } else {
760            merged_matches
761        };
762
763        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
764        let refined = refine_matches(&self.index, refined_matches, &query);
765
766        let mut sorted = refined;
767        sort_matches_by_line(&mut sorted);
768
769        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
770        Ok(sorted)
771    }
772
773    /// Get a reference to the license index.
774    pub fn index(&self) -> &index::LicenseIndex {
775        &self.index
776    }
777
778    /// Get a reference to the SPDX mapping.
779    #[cfg(test)]
780    pub fn spdx_mapping(&self) -> &SpdxMapping {
781        &self.spdx_mapping
782    }
783}
784
785#[cfg(test)]
786mod tests;