Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7mod position_set;
8mod token_multiset;
9mod token_set;
10
11#[cfg(test)]
12mod embedded_test;
13pub mod expression;
14#[cfg(test)]
15mod golden_test;
16#[cfg(feature = "golden-tests")]
17pub mod golden_utils;
18pub mod hash_match;
19pub mod index;
20mod match_refine;
21pub mod models;
22pub mod query;
23pub mod rules;
24pub mod seq_match;
25pub mod spdx_lid;
26pub mod spdx_mapping;
27#[cfg(test)]
28mod test_utils;
29pub mod tokenize;
30pub mod unknown_match;
31
32use bit_set::BitSet;
33use std::collections::HashSet;
34use std::fs;
35use std::path::Path;
36use std::sync::Arc;
37
38use anyhow::Result;
39
40use crate::license_detection::embedded::index::{
41    load_embedded_artifact_metadata_from_bytes, load_embedded_license_index_from_bytes,
42};
43use crate::license_detection::index::build_index_from_loaded;
44use crate::license_detection::query::Query;
45use crate::license_detection::rules::{
46    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
47};
48use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
49use crate::utils::text::strip_utf8_bom_str;
50
51use crate::license_detection::detection::{
52    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
53};
54use crate::license_detection::models::MatcherKind;
55
56/// Path to the license rules directory in the reference scancode-toolkit submodule.
57/// Used by test code and the xtask generate-license-loader-artifact binary.
58#[allow(dead_code)]
59pub const SCANCODE_LICENSES_RULES_PATH: &str =
60    "reference/scancode-toolkit/src/licensedcode/data/rules";
61
62/// Path to the licenses directory in the reference scancode-toolkit submodule.
63/// Used by test code and the xtask generate-license-loader-artifact binary.
64#[allow(dead_code)]
65pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
66    "reference/scancode-toolkit/src/licensedcode/data/licenses";
67
68/// Path to the license data directory in the reference scancode-toolkit submodule.
69/// Used by test code and the xtask generate-license-loader-artifact binary.
70#[allow(dead_code)]
71pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
72
/// Default URL template pointing at the ScanCode LicenseDB site.
///
/// NOTE(review): `{}` is presumably the placeholder that callers replace with a
/// license-specific path segment — confirm against the call sites.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
74
75pub(crate) use detection::{
76    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
77};
78pub use models::LicenseMatch;
79
80pub use aho_match::aho_match;
81pub use hash_match::hash_match;
82pub use match_refine::{
83    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
84    refine_matches_without_false_positive_filter, split_weak_matches,
85};
86pub use position_set::PositionSet;
87pub use spdx_lid::spdx_lid_match;
88pub use token_multiset::TokenMultiset;
89pub use token_set::TokenSet;
90pub use unknown_match::unknown_match;
91
92use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
93
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Reference-counted license index handed to every matching phase.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is constructed.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version reported for the loaded license data, when known.
    spdx_license_list_version: Option<String>,
}
105
/// Maximum number of bytes of input text handed to detection; longer text is
/// truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10 MiB
/// Candidate limit passed to `select_seq_candidates` when sequence matching the
/// individual query runs in `collect_regular_seq_matches`.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Base allowance (in query positions) for positions covered only by a sequence
/// container when deciding whether that container is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance (in query positions) for positions covered only by the
/// contained matches when deciding whether a same-expression container is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
110
111fn truncate_detection_text(clean_text: &str) -> &str {
112    if clean_text.len() <= MAX_DETECTION_SIZE {
113        return clean_text;
114    }
115
116    log::debug!(
117        "Content size {} exceeds limit {}, truncating for detection",
118        clean_text.len(),
119        MAX_DETECTION_SIZE
120    );
121
122    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
123    &clean_text[..boundary]
124}
125
126fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
127    (!m.query_span().is_empty()).then(|| m.query_span().clone())
128}
129
130fn has_full_match_coverage(m: &LicenseMatch) -> bool {
131    m.coverage() == 100.0
132}
133
/// Decides whether a sequence match (`container`) is redundant because at least
/// two fully covered Aho matches with the same license expression already
/// account for (almost) the same query positions.
///
/// Returns `true` when the container should be dropped, `false` when it must be kept.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // NOTE(review): the first disjunct is implied by the second (100 >= 99).
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    // Only near-complete sequence matches are candidates for removal.
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: fully covered Aho matches with the same expression that overlap the container.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    // A single child cannot make a container redundant.
    if contained.len() < 2 {
        return false;
    }

    // At least two children must have a matched length greater than one.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    // Order children by query-span bounds so consecutive pairs can be compared.
    contained.sort_by_key(|m| m.qspan_bounds());

    // Union of every child's query span.
    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    // Positions covered by only one side of the container/children comparison.
    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // "Bridge" positions are the gaps between one child's end bound and the
    // next child's start bound.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children whose bounds overlap each other disqualify the container from removal.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that do not fall into any bridge gap.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Keep the container when it adds exactly one position, that position sits
    // in a bridge gap, and the children add nothing outside the container.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Keep the container when its few (at most 3) extra positions all lie
    // outside the bridge gaps and entirely on one side of the children.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        // NOTE(review): the `saturating_sub(1)` suggests the end bound is exclusive — confirm.
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Thresholds scale with the number of children.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    // `contained.len() >= 2` here, so the subtraction cannot underflow.
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    // Redundant only when every difference stays within its threshold.
    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
238
239fn filter_redundant_same_expression_seq_containers(
240    seq_matches: Vec<LicenseMatch>,
241    candidate_contained_matches: &[LicenseMatch],
242) -> Vec<LicenseMatch> {
243    seq_matches
244        .into_iter()
245        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
246        .collect()
247}
248
249fn is_redundant_low_coverage_composite_seq_wrapper(
250    container: &LicenseMatch,
251    candidate_contained_matches: &[LicenseMatch],
252) -> bool {
253    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
254        return false;
255    }
256
257    let container_qspan_set = container.qspan_set();
258
259    let children: Vec<&LicenseMatch> = candidate_contained_matches
260        .iter()
261        .filter(|m| {
262            m.matcher == aho_match::MATCH_AHO
263                && has_full_match_coverage(m)
264                && m.license_expression != container.license_expression
265                && m.overlaps_with(&container_qspan_set)
266        })
267        .collect();
268
269    if children.len() < 2 {
270        return false;
271    }
272
273    let unique_expressions: HashSet<&str> = children
274        .iter()
275        .map(|m| m.license_expression.as_str())
276        .collect();
277    if unique_expressions.len() < 2 {
278        return false;
279    }
280
281    let mut child_union = PositionSet::new();
282    for m in &children {
283        child_union.extend_from_span(m.query_span());
284    }
285
286    let container_only_positions = container_qspan_set.difference(&child_union);
287    let child_only_positions = child_union.difference(&container_qspan_set);
288
289    let mut sorted_children = children;
290    sorted_children.sort_by_key(|m| m.qspan_bounds());
291
292    let mut bridge_positions = BitSet::new();
293    for pair in sorted_children.windows(2) {
294        let (_, previous_end) = pair[0].qspan_bounds();
295        let (next_start, _) = pair[1].qspan_bounds();
296        for pos in previous_end..next_start {
297            bridge_positions.insert(pos);
298        }
299    }
300
301    let container_only_boundary_positions = container_only_positions
302        .iter()
303        .filter(|&pos| !bridge_positions.contains(pos))
304        .count();
305
306    child_only_positions.is_empty()
307        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
308        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
309}
310
311fn filter_redundant_low_coverage_composite_seq_wrappers(
312    seq_matches: Vec<LicenseMatch>,
313    candidate_contained_matches: &[LicenseMatch],
314) -> Vec<LicenseMatch> {
315    seq_matches
316        .into_iter()
317        .filter(|m| {
318            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
319        })
320        .collect()
321}
322
323fn subtract_spdx_match_qspans(
324    query: &mut Query<'_>,
325    matched_qspans: &mut Vec<models::PositionSpan>,
326    aho_extra_matchables: &mut PositionSet,
327    spdx_matches: &[LicenseMatch],
328) {
329    for m in spdx_matches {
330        let Some(span) = query_span_for_match(m) else {
331            continue;
332        };
333
334        aho_extra_matchables.extend_from_span(&span);
335        query.subtract(&span);
336
337        if has_full_match_coverage(m) {
338            matched_qspans.push(span);
339        }
340    }
341}
342
343fn merge_and_prepare_aho_matches(
344    index: &index::LicenseIndex,
345    query: &mut Query<'_>,
346    matched_qspans: &mut Vec<models::PositionSpan>,
347    refined_aho: &[LicenseMatch],
348) -> (Vec<LicenseMatch>, bool) {
349    let merged_aho = merge_overlapping_matches(refined_aho);
350    let mut saw_long_exact_license_text_match = false;
351
352    for m in &merged_aho {
353        let Some(span) = query_span_for_match(m) else {
354            continue;
355        };
356
357        if has_full_match_coverage(m) {
358            matched_qspans.push(span.clone());
359        }
360
361        if index
362            .rules_by_rid
363            .get(m.rid)
364            .is_some_and(|rule| rule.is_license_text())
365            && m.rule_length > 120
366            && m.coverage() > 98.0
367        {
368            query.subtract(&span);
369            saw_long_exact_license_text_match = true;
370        }
371    }
372
373    (merged_aho, saw_long_exact_license_text_match)
374}
375
376fn collect_whole_query_exact_followup_matches(
377    index: &index::LicenseIndex,
378    query: &mut Query<'_>,
379    matched_qspans: &mut Vec<models::PositionSpan>,
380    whole_run: &query::QueryRun<'_>,
381) -> Vec<LicenseMatch> {
382    let mut seq_all_matches = Vec::new();
383
384    if whole_run.is_matchable(false, matched_qspans) {
385        let near_dupe_candidates =
386            select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
387
388        if !near_dupe_candidates.is_empty() {
389            let near_dupe_matches =
390                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
391
392            for m in &near_dupe_matches {
393                if !m.query_span().is_empty() {
394                    let span = m.query_span().clone();
395                    query.subtract(&span);
396                    matched_qspans.push(span);
397                }
398            }
399
400            seq_all_matches.extend(near_dupe_matches);
401        }
402    }
403
404    seq_all_matches
405}
406
407fn collect_regular_seq_matches(
408    index: &index::LicenseIndex,
409    query: &Query<'_>,
410    matched_qspans: &[models::PositionSpan],
411    candidate_contained_matches: &[LicenseMatch],
412) -> Vec<LicenseMatch> {
413    let mut seq_all_matches = Vec::new();
414
415    for query_run in query.query_runs() {
416        if !query_run.is_matchable(false, matched_qspans) {
417            continue;
418        }
419
420        let candidates =
421            select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
422        if !candidates.is_empty() {
423            let matches = seq_match_with_candidates(index, &query_run, &candidates);
424            seq_all_matches.extend(matches);
425        }
426    }
427
428    let merged_seq = merge_overlapping_matches(&seq_all_matches);
429    let filtered_same_expression =
430        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
431    filter_redundant_low_coverage_composite_seq_wrappers(
432        filtered_same_expression,
433        candidate_contained_matches,
434    )
435}
436
437impl LicenseDetectionEngine {
438    /// Create a new license detection engine from a pre-built license index.
439    ///
440    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
441    /// It builds the SPDX mapping from the licenses in the index.
442    fn from_index(
443        index: index::LicenseIndex,
444        spdx_license_list_version: Option<String>,
445    ) -> Result<Self> {
446        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
447        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
448        let spdx_mapping = build_spdx_mapping(&license_vec);
449
450        Ok(Self {
451            index: Arc::new(index),
452            spdx_mapping,
453            spdx_license_list_version,
454        })
455    }
456
457    /// Create a new license detection engine from the embedded license index.
458    ///
459    /// This method loads the build-time embedded license artifact and constructs
460    /// the runtime license index. This eliminates the runtime dependency on the
461    /// ScanCode rules directory.
462    ///
463    /// # Returns
464    /// A Result containing the engine or an error
465    pub fn from_embedded() -> Result<Self> {
466        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
467        let loaded = load_embedded_license_index_from_bytes(artifact_bytes)
468            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
469        Self::from_index(
470            loaded.index,
471            Some(loaded.metadata.spdx_license_list_version),
472        )
473    }
474
475    /// Create a new license detection engine from a directory of license rules.
476    ///
477    /// # Arguments
478    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
479    ///
480    /// # Returns
481    /// A Result containing the engine or an error
482    pub fn from_directory(rules_path: &Path) -> Result<Self> {
483        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
484            (rules_path.join("rules"), rules_path.join("licenses"))
485        } else if rules_path.ends_with("rules") {
486            let parent = rules_path.parent().ok_or_else(|| {
487                anyhow::anyhow!("Cannot determine parent directory for rules path")
488            })?;
489            (rules_path.to_path_buf(), parent.join("licenses"))
490        } else {
491            (rules_path.to_path_buf(), rules_path.to_path_buf())
492        };
493
494        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
495        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
496        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
497        let spdx_license_list_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
498
499        Self::from_index(index, spdx_license_list_version)
500    }
501
502    pub fn embedded_spdx_license_list_version() -> Result<String> {
503        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
504        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
505            .map_err(|e| {
506                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
507            })?
508            .spdx_license_list_version)
509    }
510
511    pub fn detect_with_kind(
512        &self,
513        text: &str,
514        unknown_licenses: bool,
515        binary_derived: bool,
516    ) -> Result<Vec<LicenseDetection>> {
517        self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
518    }
519
520    pub fn detect_with_kind_with_score(
521        &self,
522        text: &str,
523        unknown_licenses: bool,
524        binary_derived: bool,
525        min_score: f32,
526    ) -> Result<Vec<LicenseDetection>> {
527        let clean_text = strip_utf8_bom_str(text);
528
529        let content = truncate_detection_text(clean_text);
530
531        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
532        let whole_query_run = query.whole_query_run();
533
534        let mut all_matches = Vec::new();
535        let mut candidate_contained_matches = Vec::new();
536        let mut aho_extra_matchables = PositionSet::new();
537        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
538
539        // Phase 1a: Hash matching
540        // Python returns immediately if hash matches found (index.py:987-991)
541        {
542            let hash_matches = hash_match(&self.index, &whole_query_run);
543
544            if !hash_matches.is_empty() {
545                let mut matches = hash_matches;
546                sort_matches_by_line(&mut matches);
547
548                let groups = group_matches_by_region(&matches);
549                let detections: Vec<LicenseDetection> = groups
550                    .iter()
551                    .map(|group| {
552                        let mut detection = empty_detection();
553                        populate_detection_from_group_with_spdx(
554                            &mut detection,
555                            group,
556                            &self.spdx_mapping,
557                            Some(content),
558                        );
559                        detection
560                    })
561                    .collect();
562
563                return Ok(post_process_detections(detections, min_score));
564            }
565        }
566
567        // Phase 1b: SPDX-LID matching
568        {
569            let spdx_matches = spdx_lid_match(&self.index, &query);
570            subtract_spdx_match_qspans(
571                &mut query,
572                &mut matched_qspans,
573                &mut aho_extra_matchables,
574                &spdx_matches,
575            );
576            all_matches.extend(spdx_matches);
577        }
578
579        // Phase 1c: Aho-Corasick matching
580        {
581            let aho_matches = if aho_extra_matchables.is_empty() {
582                aho_match(&self.index, &whole_query_run)
583            } else {
584                aho_match::aho_match_with_extra_matchables(
585                    &self.index,
586                    &whole_query_run,
587                    Some(&aho_extra_matchables),
588                )
589            };
590
591            // Python's get_exact_matches() calls refine_matches with merge=False
592            // This applies quality filters including required phrase filtering
593            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
594            candidate_contained_matches.extend(refined_aho.clone());
595            let (merged_aho, _) = merge_and_prepare_aho_matches(
596                &self.index,
597                &mut query,
598                &mut matched_qspans,
599                &refined_aho,
600            );
601            all_matches.extend(merged_aho);
602
603            let whole_query_followup = collect_whole_query_exact_followup_matches(
604                &self.index,
605                &mut query,
606                &mut matched_qspans,
607                &whole_query_run,
608            );
609            all_matches.extend(whole_query_followup);
610
611            let merged_seq = collect_regular_seq_matches(
612                &self.index,
613                &query,
614                &matched_qspans,
615                &candidate_contained_matches,
616            );
617            all_matches.extend(merged_seq);
618        }
619
620        // Step 1: Initial refine WITHOUT false positive filtering
621        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
622        let merged_matches =
623            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
624
625        // Step 2: Unknown detection and weak match handling
626        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
627        let refined_matches = if unknown_licenses {
628            // Split weak from good - Python: index.py:1083
629            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
630
631            // Unknown detection on uncovered regions - Python: index.py:1093-1114
632            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
633            let filtered_unknown =
634                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
635
636            let mut all_matches = good_matches;
637            all_matches.extend(filtered_unknown);
638            // reinject weak matches and let refine matches keep the bests
639            // Python: index.py:1117-1118
640            all_matches.extend(weak_matches);
641            all_matches
642        } else {
643            merged_matches
644        };
645
646        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
647        let refined = refine_matches(&self.index, refined_matches, &query);
648
649        let mut sorted = refined;
650        sort_matches_by_line(&mut sorted);
651
652        let groups = group_matches_by_region(&sorted);
653
654        let detections: Vec<LicenseDetection> = groups
655            .iter()
656            .map(|group| {
657                let mut detection = empty_detection();
658                populate_detection_from_group_with_spdx(
659                    &mut detection,
660                    group,
661                    &self.spdx_mapping,
662                    Some(content),
663                );
664                detection
665            })
666            .collect();
667
668        let detections = post_process_detections(detections, min_score);
669
670        Ok(detections)
671    }
672
673    pub fn detect_with_kind_and_source(
674        &self,
675        text: &str,
676        unknown_licenses: bool,
677        binary_derived: bool,
678        source_path: &str,
679    ) -> Result<Vec<LicenseDetection>> {
680        let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
681        attach_source_path_to_detections(&mut detections, source_path);
682        Ok(detections)
683    }
684
685    pub fn detect_with_kind_and_source_with_score(
686        &self,
687        text: &str,
688        unknown_licenses: bool,
689        binary_derived: bool,
690        source_path: &str,
691        min_score: f32,
692    ) -> Result<Vec<LicenseDetection>> {
693        let mut detections =
694            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
695        attach_source_path_to_detections(&mut detections, source_path);
696        Ok(detections)
697    }
698
699    /// Detect licenses and return raw matches (like Python's idx.match()).
700    ///
701    /// This is primarily used by golden tests and maintenance tooling that need
702    /// raw match sequences before grouping or post-processing into detections.
703    #[cfg(any(test, feature = "golden-tests"))]
704    pub fn detect_matches_with_kind(
705        &self,
706        text: &str,
707        unknown_licenses: bool,
708        binary_derived: bool,
709    ) -> Result<Vec<LicenseMatch>> {
710        let clean_text = strip_utf8_bom_str(text);
711
712        let content = truncate_detection_text(clean_text);
713
714        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
715        let whole_query_run = query.whole_query_run();
716
717        let mut all_matches = Vec::new();
718        let mut candidate_contained_matches = Vec::new();
719        let mut aho_extra_matchables = PositionSet::new();
720        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
721
722        // Phase 1a: Hash matching
723        {
724            let hash_matches = hash_match(&self.index, &whole_query_run);
725
726            if !hash_matches.is_empty() {
727                let mut matches = hash_matches;
728                sort_matches_by_line(&mut matches);
729                return Ok(matches);
730            }
731        }
732
733        // Phase 1b: SPDX-LID matching
734        {
735            let spdx_matches = spdx_lid_match(&self.index, &query);
736            subtract_spdx_match_qspans(
737                &mut query,
738                &mut matched_qspans,
739                &mut aho_extra_matchables,
740                &spdx_matches,
741            );
742            all_matches.extend(spdx_matches);
743        }
744
745        // Phase 1c: Aho-Corasick matching
746        {
747            let aho_matches = if aho_extra_matchables.is_empty() {
748                aho_match(&self.index, &whole_query_run)
749            } else {
750                aho_match::aho_match_with_extra_matchables(
751                    &self.index,
752                    &whole_query_run,
753                    Some(&aho_extra_matchables),
754                )
755            };
756            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
757            candidate_contained_matches.extend(refined_aho.clone());
758            let (merged_aho, _) = merge_and_prepare_aho_matches(
759                &self.index,
760                &mut query,
761                &mut matched_qspans,
762                &refined_aho,
763            );
764            all_matches.extend(merged_aho);
765
766            let whole_query_followup = collect_whole_query_exact_followup_matches(
767                &self.index,
768                &mut query,
769                &mut matched_qspans,
770                &whole_query_run,
771            );
772            all_matches.extend(whole_query_followup);
773
774            let merged_seq = collect_regular_seq_matches(
775                &self.index,
776                &query,
777                &matched_qspans,
778                &candidate_contained_matches,
779            );
780            all_matches.extend(merged_seq);
781        }
782
783        // Step 1: Initial refine WITHOUT false positive filtering
784        let merged_matches =
785            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
786
787        // Step 2: Unknown detection and weak match handling
788        let refined_matches = if unknown_licenses {
789            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
790            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
791            let filtered_unknown =
792                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
793
794            let mut all_matches = good_matches;
795            all_matches.extend(filtered_unknown);
796            all_matches.extend(weak_matches);
797            all_matches
798        } else {
799            merged_matches
800        };
801
802        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
803        let refined = refine_matches(&self.index, refined_matches, &query);
804
805        let mut sorted = refined;
806        sort_matches_by_line(&mut sorted);
807
808        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
809        Ok(sorted)
810    }
811
812    /// Get a reference to the license index.
813    pub fn index(&self) -> &index::LicenseIndex {
814        &self.index
815    }
816
817    pub fn spdx_license_list_version(&self) -> Option<&str> {
818        self.spdx_license_list_version.as_deref()
819    }
820
821    /// Get a reference to the SPDX mapping.
822    #[cfg(test)]
823    pub fn spdx_mapping(&self) -> &SpdxMapping {
824        &self.spdx_mapping
825    }
826}
827
828pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
829    for ancestor in search_path.ancestors() {
830        let candidate = ancestor.join("scancode_config.py");
831        if candidate.is_file() {
832            let config = fs::read_to_string(&candidate)?;
833            return Ok(parse_scancode_spdx_license_list_version(&config));
834        }
835    }
836
837    Ok(None)
838}
839
/// Extracts the value assigned to `spdx_license_list_version` in the text of a
/// `scancode_config.py` file.
///
/// The first line whose left-hand side starts with that identifier wins. The
/// identifier must end at an identifier boundary, so longer names such as
/// `spdx_license_list_version_url` are skipped, while annotated assignments like
/// `spdx_license_list_version: str = '3.27'` are still accepted. Surrounding
/// whitespace and quote characters are stripped from the value.
///
/// # Returns
/// The version string, or `None` when no matching assignment exists.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (lhs, value) = line.trim().split_once('=')?;
        let suffix = lhs.strip_prefix(KEY)?;

        // A following identifier character means this is a different, longer name.
        if suffix.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
853
854#[cfg(test)]
855mod tests;