Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7pub mod license_cache;
8mod position_set;
9mod token_multiset;
10mod token_set;
11
12#[cfg(test)]
13mod embedded_test;
14pub mod expression;
15#[cfg(all(test, feature = "golden-tests"))]
16mod golden_test;
17#[cfg(feature = "golden-tests")]
18pub mod golden_utils;
19pub mod hash_match;
20pub mod index;
21mod match_refine;
22pub mod models;
23pub mod query;
24pub mod rules;
25pub mod seq_match;
26pub mod spdx_lid;
27pub mod spdx_mapping;
28#[cfg(test)]
29mod test_utils;
30pub mod tokenize;
31pub mod unknown_match;
32
33use bit_set::BitSet;
34use std::collections::HashSet;
35use std::fs;
36use std::path::Path;
37use std::sync::Arc;
38use std::time::Instant;
39
40use anyhow::Result;
41
42use crate::license_detection::embedded::index::{
43    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
44};
45use crate::license_detection::index::CachedLicenseIndex;
46use crate::license_detection::index::build_index_from_loaded;
47use crate::license_detection::license_cache::{
48    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
49    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
50};
51use crate::license_detection::query::Query;
52use crate::license_detection::rules::{
53    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
54};
55use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
56use crate::utils::text::strip_utf8_bom_str;
57
58use crate::license_detection::detection::{
59    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
60};
61use crate::license_detection::models::MatcherKind;
62
/// Path to the license rules directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Path to the licenses directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Path to the license data directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// Default URL template pointing at the ScanCode LicenseDB site.
///
/// NOTE(review): the `{}` placeholder is presumably substituted by callers
/// (likely with a license key); the substitution is not visible in this
/// module — confirm against the call sites.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
81
82pub(crate) use detection::{
83    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
84};
85pub use models::LicenseMatch;
86
87pub use aho_match::aho_match;
88pub use hash_match::hash_match;
89pub use match_refine::{
90    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
91    refine_matches_without_false_positive_filter, split_weak_matches,
92};
93pub use position_set::PositionSet;
94pub use spdx_lid::spdx_lid_match;
95pub use token_multiset::TokenMultiset;
96pub use token_set::TokenSet;
97pub use unknown_match::unknown_match;
98
99use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
100
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
///
/// The type derives `Clone`; the index is held behind an [`Arc`], so cloning the
/// engine does not copy the index itself.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// License index consulted by every matcher, wrapped in an `Arc` by `from_index`.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built by `build_spdx_mapping` from the licenses stored in the index.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version supplied by the constructor, when one is known.
    spdx_license_list_version: Option<String>,
}
112
/// Maximum number of bytes of input text handed to detection; longer input is
/// truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
/// Candidate limit passed to `select_seq_candidates` for each regular query run.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Tolerance (in query positions) applied to positions covered only by a sequence
/// container when deciding whether that container is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Tolerance (in query positions, before the `+ 1` applied at the use site) for
/// positions covered only by the contained exact matches.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
117
118fn truncate_detection_text(clean_text: &str) -> &str {
119    if clean_text.len() <= MAX_DETECTION_SIZE {
120        return clean_text;
121    }
122
123    log::debug!(
124        "Content size {} exceeds limit {}, truncating for detection",
125        clean_text.len(),
126        MAX_DETECTION_SIZE
127    );
128
129    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
130    &clean_text[..boundary]
131}
132
133fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
134    (!m.query_span().is_empty()).then(|| m.query_span().clone())
135}
136
137fn has_full_match_coverage(m: &LicenseMatch) -> bool {
138    m.coverage() == 100.0
139}
140
/// Decides whether a sequence match (`container`) merely wraps several exact
/// Aho-Corasick matches of the same license expression and can be dropped.
///
/// Returns `false` (keep the container) unless all of the following hold:
/// the container comes from the sequence matcher with coverage of at least 99;
/// at least two full-coverage Aho matches with the same license expression
/// overlap it, at least two of them longer than one token; those children do
/// not overlap each other by bounds; and the positions that only the container
/// or only the children cover stay within the `MAX_REDUNDANT_SEQ_CONTAINER_*`
/// tolerances. Two special cases further down keep the container even when the
/// tolerances would be met.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // NOTE(review): the full-coverage test is subsumed by `>= 99.0`; both are kept as written.
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: exact (full-coverage) Aho matches for the same expression that
    // overlap the container's query positions.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    if contained.len() < 2 {
        return false;
    }

    // Require at least two children that matched more than a single token.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    // Order children by their query-span bounds so neighbours can be compared pairwise.
    contained.sort_by_key(|m| m.qspan_bounds());

    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    // Positions covered by exactly one side.
    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // "Bridge" positions are the gaps between consecutive children.
    // NOTE(review): this treats the second element of `qspan_bounds()` as an
    // exclusive end — confirm against its definition.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children whose bounds overlap disqualify the container from removal.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that are NOT inside a gap between two children.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Special case 1: the container adds exactly one position, it sits in a
    // bridge, and the children add nothing of their own -> keep the container.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Special case 2: the container adds at most three positions, none of them in
    // a bridge, and all of them on one side of the children -> keep the container.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Tolerances scale with the number of children; `contained.len() >= 2` is
    // guaranteed above, so the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
245
246fn filter_redundant_same_expression_seq_containers(
247    seq_matches: Vec<LicenseMatch>,
248    candidate_contained_matches: &[LicenseMatch],
249) -> Vec<LicenseMatch> {
250    seq_matches
251        .into_iter()
252        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
253        .collect()
254}
255
256fn is_redundant_low_coverage_composite_seq_wrapper(
257    container: &LicenseMatch,
258    candidate_contained_matches: &[LicenseMatch],
259) -> bool {
260    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
261        return false;
262    }
263
264    let container_qspan_set = container.qspan_set();
265
266    let children: Vec<&LicenseMatch> = candidate_contained_matches
267        .iter()
268        .filter(|m| {
269            m.matcher == aho_match::MATCH_AHO
270                && has_full_match_coverage(m)
271                && m.license_expression != container.license_expression
272                && m.overlaps_with(&container_qspan_set)
273        })
274        .collect();
275
276    if children.len() < 2 {
277        return false;
278    }
279
280    let unique_expressions: HashSet<&str> = children
281        .iter()
282        .map(|m| m.license_expression.as_str())
283        .collect();
284    if unique_expressions.len() < 2 {
285        return false;
286    }
287
288    let mut child_union = PositionSet::new();
289    for m in &children {
290        child_union.extend_from_span(m.query_span());
291    }
292
293    let container_only_positions = container_qspan_set.difference(&child_union);
294    let child_only_positions = child_union.difference(&container_qspan_set);
295
296    let mut sorted_children = children;
297    sorted_children.sort_by_key(|m| m.qspan_bounds());
298
299    let mut bridge_positions = BitSet::new();
300    for pair in sorted_children.windows(2) {
301        let (_, previous_end) = pair[0].qspan_bounds();
302        let (next_start, _) = pair[1].qspan_bounds();
303        for pos in previous_end..next_start {
304            bridge_positions.insert(pos);
305        }
306    }
307
308    let container_only_boundary_positions = container_only_positions
309        .iter()
310        .filter(|&pos| !bridge_positions.contains(pos))
311        .count();
312
313    child_only_positions.is_empty()
314        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
315        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
316}
317
318fn filter_redundant_low_coverage_composite_seq_wrappers(
319    seq_matches: Vec<LicenseMatch>,
320    candidate_contained_matches: &[LicenseMatch],
321) -> Vec<LicenseMatch> {
322    seq_matches
323        .into_iter()
324        .filter(|m| {
325            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
326        })
327        .collect()
328}
329
330fn subtract_spdx_match_qspans(
331    query: &mut Query<'_>,
332    matched_qspans: &mut Vec<models::PositionSpan>,
333    aho_extra_matchables: &mut PositionSet,
334    spdx_matches: &[LicenseMatch],
335) {
336    for m in spdx_matches {
337        let Some(span) = query_span_for_match(m) else {
338            continue;
339        };
340
341        aho_extra_matchables.extend_from_span(&span);
342        query.subtract(&span);
343
344        if has_full_match_coverage(m) {
345            matched_qspans.push(span);
346        }
347    }
348}
349
350fn merge_and_prepare_aho_matches(
351    index: &index::LicenseIndex,
352    query: &mut Query<'_>,
353    matched_qspans: &mut Vec<models::PositionSpan>,
354    refined_aho: &[LicenseMatch],
355) -> (Vec<LicenseMatch>, bool) {
356    let merged_aho = merge_overlapping_matches(refined_aho);
357    let mut saw_long_exact_license_text_match = false;
358
359    for m in &merged_aho {
360        let Some(span) = query_span_for_match(m) else {
361            continue;
362        };
363
364        if has_full_match_coverage(m) {
365            matched_qspans.push(span.clone());
366        }
367
368        if index
369            .rules_by_rid
370            .get(m.rid)
371            .is_some_and(|rule| rule.is_license_text())
372            && m.rule_length > 120
373            && m.coverage() > 98.0
374        {
375            query.subtract(&span);
376            saw_long_exact_license_text_match = true;
377        }
378    }
379
380    (merged_aho, saw_long_exact_license_text_match)
381}
382
383fn collect_whole_query_exact_followup_matches(
384    index: &index::LicenseIndex,
385    query: &mut Query<'_>,
386    matched_qspans: &mut Vec<models::PositionSpan>,
387    whole_run: &query::QueryRun<'_>,
388) -> Vec<LicenseMatch> {
389    let mut seq_all_matches = Vec::new();
390
391    if whole_run.is_matchable(false, matched_qspans) {
392        let near_dupe_candidates =
393            select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
394
395        if !near_dupe_candidates.is_empty() {
396            let near_dupe_matches =
397                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
398
399            for m in &near_dupe_matches {
400                if !m.query_span().is_empty() {
401                    let span = m.query_span().clone();
402                    query.subtract(&span);
403                    matched_qspans.push(span);
404                }
405            }
406
407            seq_all_matches.extend(near_dupe_matches);
408        }
409    }
410
411    seq_all_matches
412}
413
414fn collect_regular_seq_matches(
415    index: &index::LicenseIndex,
416    query: &Query<'_>,
417    matched_qspans: &[models::PositionSpan],
418    candidate_contained_matches: &[LicenseMatch],
419) -> Vec<LicenseMatch> {
420    let mut seq_all_matches = Vec::new();
421
422    for query_run in query.query_runs() {
423        if !query_run.is_matchable(false, matched_qspans) {
424            continue;
425        }
426
427        let candidates =
428            select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
429        if !candidates.is_empty() {
430            let matches = seq_match_with_candidates(index, &query_run, &candidates);
431            seq_all_matches.extend(matches);
432        }
433    }
434
435    let merged_seq = merge_overlapping_matches(&seq_all_matches);
436    let filtered_same_expression =
437        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
438    filter_redundant_low_coverage_composite_seq_wrappers(
439        filtered_same_expression,
440        candidate_contained_matches,
441    )
442}
443
444impl LicenseDetectionEngine {
445    /// Create a new license detection engine from a pre-built license index.
446    ///
447    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
448    /// It builds the SPDX mapping from the licenses in the index.
449    fn from_index(
450        index: index::LicenseIndex,
451        spdx_license_list_version: Option<String>,
452    ) -> Result<Self> {
453        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
454        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
455        let spdx_mapping = build_spdx_mapping(&license_vec);
456
457        Ok(Self {
458            index: Arc::new(index),
459            spdx_mapping,
460            spdx_license_list_version,
461        })
462    }
463
464    /// Create a new license detection engine from the embedded license index.
465    ///
466    /// Convenience method that uses the default Provenant cache root and does
467    /// not force a reindex.
468    pub fn from_embedded() -> Result<Self> {
469        let cache_config =
470            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
471        Self::from_embedded_with_cache(&cache_config)
472    }
473
474    /// Create a new license detection engine from the embedded license index.
475    ///
476    /// This method loads the build-time embedded license artifact and constructs
477    /// the runtime license index. This eliminates the runtime dependency on the
478    /// ScanCode rules directory.
479    ///
480    /// If a valid cache exists (matching fingerprint), the index is loaded from
481    /// the rkyv cache file instead of being rebuilt from scratch.
482    ///
483    /// # Arguments
484    /// * `cache_config` - Cache configuration (directory and reindex flag)
485    ///
486    /// # Returns
487    /// A Result containing the engine or an error
488    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
489        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
490        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
491
492        if !cache_config.reindex {
493            if let Some(cached) =
494                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
495            {
496                let start = Instant::now();
497                let spdx_version = cached.spdx_license_list_version.clone();
498                let index = index::LicenseIndex::from(cached);
499                eprintln!(
500                    "License index loaded from rkyv cache in {:.2}s",
501                    start.elapsed().as_secs_f64()
502                );
503                return Self::from_index(index, spdx_version);
504            }
505        } else {
506            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
507        }
508
509        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
510            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
511        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
512
513        let start = Instant::now();
514        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
515        eprintln!(
516            "License index built from embedded artifact in {:.2}s",
517            start.elapsed().as_secs_f64()
518        );
519
520        let mut cached = CachedLicenseIndex::from(index.clone());
521        cached.spdx_license_list_version = spdx_version.clone();
522        if let Err(e) = save_cached_index(
523            cache_config,
524            LicenseCacheNamespace::Embedded,
525            &cached,
526            &fingerprint,
527        ) {
528            eprintln!("Warning: failed to save license index cache: {}", e);
529        } else if let Some(size) =
530            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
531        {
532            eprintln!(
533                "License index cache saved ({:.1} MB)",
534                size as f64 / 1_048_576.0
535            );
536        }
537
538        Self::from_index(index, spdx_version)
539    }
540
541    /// Create a new license detection engine from a directory of license rules.
542    ///
543    /// Convenience method that uses the default Provenant cache root and does
544    /// not force a reindex.
545    pub fn from_directory(rules_path: &Path) -> Result<Self> {
546        let cache_config =
547            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
548        Self::from_directory_with_cache(rules_path, &cache_config)
549    }
550
551    /// Create a new license detection engine from a directory of license rules.
552    ///
553    /// If a valid cache exists (matching fingerprint of the rules), the index is
554    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
555    ///
556    /// # Arguments
557    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
558    /// * `cache_config` - Cache configuration (directory and reindex flag)
559    ///
560    /// # Returns
561    /// A Result containing the engine or an error
562    pub fn from_directory_with_cache(
563        rules_path: &Path,
564        cache_config: &LicenseCacheConfig,
565    ) -> Result<Self> {
566        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
567            (rules_path.join("rules"), rules_path.join("licenses"))
568        } else if rules_path.ends_with("rules") {
569            let parent = rules_path.parent().ok_or_else(|| {
570                anyhow::anyhow!("Cannot determine parent directory for rules path")
571            })?;
572            (rules_path.to_path_buf(), parent.join("licenses"))
573        } else {
574            (rules_path.to_path_buf(), rules_path.to_path_buf())
575        };
576
577        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
578        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
579
580        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses);
581
582        if !cache_config.reindex {
583            if let Some(cached) = load_cached_index(
584                cache_config,
585                LicenseCacheNamespace::CustomRules,
586                &fingerprint,
587            )? {
588                let start = Instant::now();
589                let index = index::LicenseIndex::from(cached);
590                eprintln!(
591                    "License index loaded from rkyv cache in {:.2}s",
592                    start.elapsed().as_secs_f64()
593                );
594                let spdx_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
595                return Self::from_index(index, spdx_version);
596            }
597        } else {
598            delete_cache(
599                cache_config,
600                LicenseCacheNamespace::CustomRules,
601                &fingerprint,
602            )?;
603        }
604
605        let start = Instant::now();
606        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
607        eprintln!(
608            "License index built from rules directory in {:.2}s",
609            start.elapsed().as_secs_f64()
610        );
611
612        let spdx_license_list_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
613
614        let cached = CachedLicenseIndex::from(index.clone());
615        if let Err(e) = save_cached_index(
616            cache_config,
617            LicenseCacheNamespace::CustomRules,
618            &cached,
619            &fingerprint,
620        ) {
621            eprintln!("Warning: failed to save license index cache: {}", e);
622        } else if let Some(size) = cache_file_size(
623            cache_config,
624            LicenseCacheNamespace::CustomRules,
625            &fingerprint,
626        ) {
627            eprintln!(
628                "License index cache saved ({:.1} MB)",
629                size as f64 / 1_048_576.0
630            );
631        }
632
633        Self::from_index(index, spdx_license_list_version)
634    }
635
636    pub fn embedded_spdx_license_list_version() -> Result<String> {
637        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
638        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
639            .map_err(|e| {
640                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
641            })?
642            .spdx_license_list_version)
643    }
644
645    pub fn detect_with_kind(
646        &self,
647        text: &str,
648        unknown_licenses: bool,
649        binary_derived: bool,
650    ) -> Result<Vec<LicenseDetection>> {
651        self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
652    }
653
654    pub fn detect_with_kind_with_score(
655        &self,
656        text: &str,
657        unknown_licenses: bool,
658        binary_derived: bool,
659        min_score: f32,
660    ) -> Result<Vec<LicenseDetection>> {
661        let clean_text = strip_utf8_bom_str(text);
662
663        let content = truncate_detection_text(clean_text);
664
665        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
666        let whole_query_run = query.whole_query_run();
667
668        let mut all_matches = Vec::new();
669        let mut candidate_contained_matches = Vec::new();
670        let mut aho_extra_matchables = PositionSet::new();
671        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
672
673        // Phase 1a: Hash matching
674        // Python returns immediately if hash matches found (index.py:987-991)
675        {
676            let hash_matches = hash_match(&self.index, &whole_query_run);
677
678            if !hash_matches.is_empty() {
679                let mut matches = hash_matches;
680                sort_matches_by_line(&mut matches);
681
682                let groups = group_matches_by_region(&matches);
683                let detections: Vec<LicenseDetection> = groups
684                    .iter()
685                    .map(|group| {
686                        let mut detection = empty_detection();
687                        populate_detection_from_group_with_spdx(
688                            &mut detection,
689                            group,
690                            &self.spdx_mapping,
691                            Some(content),
692                        );
693                        detection
694                    })
695                    .collect();
696
697                return Ok(post_process_detections(detections, min_score));
698            }
699        }
700
701        // Phase 1b: SPDX-LID matching
702        {
703            let spdx_matches = spdx_lid_match(&self.index, &query);
704            subtract_spdx_match_qspans(
705                &mut query,
706                &mut matched_qspans,
707                &mut aho_extra_matchables,
708                &spdx_matches,
709            );
710            all_matches.extend(spdx_matches);
711        }
712
713        // Phase 1c: Aho-Corasick matching
714        {
715            let aho_matches = if aho_extra_matchables.is_empty() {
716                aho_match(&self.index, &whole_query_run)
717            } else {
718                aho_match::aho_match_with_extra_matchables(
719                    &self.index,
720                    &whole_query_run,
721                    Some(&aho_extra_matchables),
722                )
723            };
724
725            // Python's get_exact_matches() calls refine_matches with merge=False
726            // This applies quality filters including required phrase filtering
727            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
728            candidate_contained_matches.extend(refined_aho.clone());
729            let (merged_aho, _) = merge_and_prepare_aho_matches(
730                &self.index,
731                &mut query,
732                &mut matched_qspans,
733                &refined_aho,
734            );
735            all_matches.extend(merged_aho);
736
737            let whole_query_followup = collect_whole_query_exact_followup_matches(
738                &self.index,
739                &mut query,
740                &mut matched_qspans,
741                &whole_query_run,
742            );
743            all_matches.extend(whole_query_followup);
744
745            let merged_seq = collect_regular_seq_matches(
746                &self.index,
747                &query,
748                &matched_qspans,
749                &candidate_contained_matches,
750            );
751            all_matches.extend(merged_seq);
752        }
753
754        // Step 1: Initial refine WITHOUT false positive filtering
755        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
756        let merged_matches =
757            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
758
759        // Step 2: Unknown detection and weak match handling
760        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
761        let refined_matches = if unknown_licenses {
762            // Split weak from good - Python: index.py:1083
763            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
764
765            // Unknown detection on uncovered regions - Python: index.py:1093-1114
766            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
767            let filtered_unknown =
768                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
769
770            let mut all_matches = good_matches;
771            all_matches.extend(filtered_unknown);
772            // reinject weak matches and let refine matches keep the bests
773            // Python: index.py:1117-1118
774            all_matches.extend(weak_matches);
775            all_matches
776        } else {
777            merged_matches
778        };
779
780        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
781        let refined = refine_matches(&self.index, refined_matches, &query);
782
783        let mut sorted = refined;
784        sort_matches_by_line(&mut sorted);
785
786        let groups = group_matches_by_region(&sorted);
787
788        let detections: Vec<LicenseDetection> = groups
789            .iter()
790            .map(|group| {
791                let mut detection = empty_detection();
792                populate_detection_from_group_with_spdx(
793                    &mut detection,
794                    group,
795                    &self.spdx_mapping,
796                    Some(content),
797                );
798                detection
799            })
800            .collect();
801
802        let detections = post_process_detections(detections, min_score);
803
804        Ok(detections)
805    }
806
807    pub fn detect_with_kind_and_source(
808        &self,
809        text: &str,
810        unknown_licenses: bool,
811        binary_derived: bool,
812        source_path: &str,
813    ) -> Result<Vec<LicenseDetection>> {
814        let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
815        attach_source_path_to_detections(&mut detections, source_path);
816        Ok(detections)
817    }
818
819    pub fn detect_with_kind_and_source_with_score(
820        &self,
821        text: &str,
822        unknown_licenses: bool,
823        binary_derived: bool,
824        source_path: &str,
825        min_score: f32,
826    ) -> Result<Vec<LicenseDetection>> {
827        let mut detections =
828            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
829        attach_source_path_to_detections(&mut detections, source_path);
830        Ok(detections)
831    }
832
    /// Detect licenses and return raw matches (like Python's idx.match()).
    ///
    /// This is primarily used by golden tests and maintenance tooling that need
    /// raw match sequences before grouping or post-processing into detections.
    ///
    /// The pipeline runs in this order:
    ///
    /// 1. Hash matching over the whole query; any hit is returned immediately.
    /// 2. SPDX-LID matching.
    /// 3. Aho-Corasick matching, followed by whole-query exact follow-up matches
    ///    and regular sequence matches.
    /// 4. A refine pass without false-positive filtering.
    /// 5. Optional unknown-license handling when `unknown_licenses` is `true`.
    /// 6. A final refine pass with false-positive filtering.
    ///
    /// The returned matches are sorted with `sort_matches_by_line` and are not
    /// grouped into detections.
    ///
    /// # Errors
    ///
    /// Returns an error if `Query::from_extracted_text` fails to build a query
    /// from the input text.
    #[cfg(any(test, feature = "golden-tests"))]
    pub fn detect_matches_with_kind(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
    ) -> Result<Vec<LicenseMatch>> {
        // Normalise the input before tokenising: BOM stripping first, then truncation.
        let clean_text = strip_utf8_bom_str(text);

        let content = truncate_detection_text(clean_text);

        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
        // Taken once up front and reused by the hash, Aho-Corasick and
        // whole-query follow-up phases below.
        let whole_query_run = query.whole_query_run();

        // Accumulators shared across the matching phases.
        let mut all_matches = Vec::new();
        let mut candidate_contained_matches = Vec::new();
        let mut aho_extra_matchables = PositionSet::new();
        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();

        // Phase 1a: Hash matching
        {
            let hash_matches = hash_match(&self.index, &whole_query_run);

            // A hash hit short-circuits the pipeline: no later phase or refine
            // step runs, the hash matches are just sorted and returned.
            if !hash_matches.is_empty() {
                let mut matches = hash_matches;
                sort_matches_by_line(&mut matches);
                return Ok(matches);
            }
        }

        // Phase 1b: SPDX-LID matching
        {
            let spdx_matches = spdx_lid_match(&self.index, &query);
            // NOTE(review): presumably removes the SPDX-matched spans from the
            // query and records them in `matched_qspans` / `aho_extra_matchables`
            // — confirm against `subtract_spdx_match_qspans`.
            subtract_spdx_match_qspans(
                &mut query,
                &mut matched_qspans,
                &mut aho_extra_matchables,
                &spdx_matches,
            );
            all_matches.extend(spdx_matches);
        }

        // Phase 1c: Aho-Corasick matching, then whole-query exact follow-up and
        // regular sequence matching (all three feed `all_matches`).
        {
            // Only take the extra-matchables path when the SPDX phase produced any.
            let aho_matches = if aho_extra_matchables.is_empty() {
                aho_match(&self.index, &whole_query_run)
            } else {
                aho_match::aho_match_with_extra_matchables(
                    &self.index,
                    &whole_query_run,
                    Some(&aho_extra_matchables),
                )
            };
            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
            // Keep a copy of the refined (pre-merge) Aho matches; they are passed
            // to `collect_regular_seq_matches` further down.
            candidate_contained_matches.extend(refined_aho.clone());
            // The second element of the returned tuple is not needed here.
            let (merged_aho, _) = merge_and_prepare_aho_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &refined_aho,
            );
            all_matches.extend(merged_aho);

            let whole_query_followup = collect_whole_query_exact_followup_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &whole_query_run,
            );
            all_matches.extend(whole_query_followup);

            let merged_seq = collect_regular_seq_matches(
                &self.index,
                &query,
                &matched_qspans,
                &candidate_contained_matches,
            );
            all_matches.extend(merged_seq);
        }

        // Step 1: Initial refine WITHOUT false positive filtering
        let merged_matches =
            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);

        // Step 2: Unknown detection and weak match handling
        let refined_matches = if unknown_licenses {
            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
            // Unknown matching is driven by the good matches only.
            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
            let filtered_unknown =
                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);

            // Recombine in the order: good, filtered unknown, weak.
            let mut all_matches = good_matches;
            all_matches.extend(filtered_unknown);
            all_matches.extend(weak_matches);
            all_matches
        } else {
            merged_matches
        };

        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
        let refined = refine_matches(&self.index, refined_matches, &query);

        let mut sorted = refined;
        sort_matches_by_line(&mut sorted);

        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
        Ok(sorted)
    }
945
    /// Get a reference to the license index.
    ///
    /// This is the same index the engine passes to its matchers during
    /// detection.
    pub fn index(&self) -> &index::LicenseIndex {
        &self.index
    }
950
    /// Get the SPDX license list version stored on this engine, if any.
    ///
    /// Returns `None` when no version is stored.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }
954
    /// Get a reference to the SPDX mapping.
    ///
    /// Only compiled in test builds.
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
960}
961
962pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
963    for ancestor in search_path.ancestors() {
964        let candidate = ancestor.join("scancode_config.py");
965        if candidate.is_file() {
966            let config = fs::read_to_string(&candidate)?;
967            return Ok(parse_scancode_spdx_license_list_version(&config));
968        }
969    }
970
971    Ok(None)
972}
973
/// Extract the value assigned to `spdx_license_list_version` in the text of a
/// `scancode_config.py` file.
///
/// Lines are scanned in order and the first matching assignment wins. A line
/// matches when, after trimming, it starts with the identifier
/// `spdx_license_list_version` followed by something other than another
/// identifier character, and contains an `=`. Longer names that merely share
/// the prefix (for example `spdx_license_list_version_url`) are skipped.
///
/// The text after the first `=` is trimmed of whitespace and of surrounding
/// `"` and `'` characters. Trailing `# comments` are not stripped.
///
/// Returns `None` when no line matches.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let after_key = line.trim().strip_prefix(KEY)?;

        // An identifier character right after the key means this is a
        // different, longer variable name, not the one we are looking for.
        if after_key.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        // The key itself contains no '=', so splitting the remainder yields
        // the same value as splitting the whole line.
        let (_, value) = after_key.split_once('=')?;
        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
987
988#[cfg(test)]
989mod tests;