Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7pub mod license_cache;
8mod position_set;
9mod token_multiset;
10mod token_set;
11
12#[cfg(test)]
13mod embedded_test;
14pub mod expression;
15#[cfg(all(test, feature = "golden-tests"))]
16mod golden_test;
17#[cfg(feature = "golden-tests")]
18pub mod golden_utils;
19pub mod hash_match;
20pub mod index;
21mod match_refine;
22pub mod models;
23pub mod query;
24pub mod rules;
25pub mod seq_match;
26pub mod spdx_lid;
27pub mod spdx_mapping;
28#[cfg(test)]
29mod test_utils;
30pub mod tokenize;
31pub mod unknown_match;
32
33use bit_set::BitSet;
34use std::collections::HashSet;
35use std::fs;
36use std::path::Path;
37use std::sync::Arc;
38use std::time::Instant;
39
40use anyhow::Result;
41
42use crate::license_detection::embedded::index::{
43    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
44};
45use crate::license_detection::index::build_index_from_loaded;
46use crate::license_detection::license_cache::{
47    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
48    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
49};
50use crate::license_detection::query::Query;
51use crate::license_detection::rules::{
52    load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
53};
54use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
55use crate::utils::text::strip_utf8_bom_str;
56
57use crate::license_detection::detection::{
58    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
59};
60use crate::license_detection::models::MatcherKind;
61
/// Path to the license rules directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Path to the licenses directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Path to the license data directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
///
/// This is the parent of the `rules` and `licenses` directories named by the
/// two constants above.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// Default URL template for the ScanCode LicenseDB site.
///
/// The `{}` is a placeholder to be filled in by the caller.
// NOTE(review): presumably the placeholder is replaced with a license key or
// file name — confirm against the call sites, which are not in this module.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
80
81pub(crate) use detection::{
82    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
83};
84pub use models::LicenseMatch;
85
86pub use aho_match::aho_match;
87pub use hash_match::hash_match;
88pub use match_refine::{
89    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
90    refine_matches_without_false_positive_filter, split_weak_matches,
91};
92pub use position_set::PositionSet;
93pub use spdx_lid::spdx_lid_match;
94pub use token_multiset::TokenMultiset;
95pub use token_set::TokenSet;
96pub use unknown_match::unknown_match;
97
98use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
99
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
///
/// Cloning an engine is cheap with respect to the index: the index is held
/// behind an [`Arc`], so clones share it.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// The license index that every matcher is run against.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the licenses stored in `index`.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version associated with the loaded rules, when known.
    spdx_license_list_version: Option<String>,
}
111
/// Maximum number of bytes of input text that detection looks at; longer text
/// is cut down to this size by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
/// Candidate limit handed to `select_seq_candidates` by
/// `collect_regular_seq_matches`.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Budget of container-only query positions tolerated by the redundant
/// sequence-container checks (scaled by the number of contained matches in
/// `is_redundant_same_expression_seq_container`).
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Budget of child-only query positions (positions covered by contained
/// matches but not by the container); the check allows this value plus one.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
116
117fn truncate_detection_text(clean_text: &str) -> &str {
118    if clean_text.len() <= MAX_DETECTION_SIZE {
119        return clean_text;
120    }
121
122    log::debug!(
123        "Content size {} exceeds limit {}, truncating for detection",
124        clean_text.len(),
125        MAX_DETECTION_SIZE
126    );
127
128    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
129    &clean_text[..boundary]
130}
131
132fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
133    (!m.query_span().is_empty()).then(|| m.query_span().clone())
134}
135
136fn has_full_match_coverage(m: &LicenseMatch) -> bool {
137    m.coverage() == 100.0
138}
139
140fn is_redundant_same_expression_seq_container(
141    container: &LicenseMatch,
142    candidate_contained_matches: &[LicenseMatch],
143) -> bool {
144    let container_is_redundant_coverage =
145        has_full_match_coverage(container) || container.coverage() >= 99.0;
146    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
147        return false;
148    }
149
150    let container_qspan_set = container.qspan_set();
151
152    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
153        .iter()
154        .filter(|m| {
155            m.matcher == MatcherKind::Aho
156                && has_full_match_coverage(m)
157                && m.license_expression == container.license_expression
158                && m.overlaps_with(&container_qspan_set)
159        })
160        .collect();
161
162    if contained.len() < 2 {
163        return false;
164    }
165
166    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
167    if material_children < 2 {
168        return false;
169    }
170
171    contained.sort_by_key(|m| m.qspan_bounds());
172
173    let mut child_union = PositionSet::new();
174    for m in &contained {
175        child_union.extend_from_span(m.query_span());
176    }
177
178    let container_only_positions = container_qspan_set.difference(&child_union);
179    let child_only_positions = child_union.difference(&container_qspan_set);
180
181    let mut bridge_positions = BitSet::new();
182    for pair in contained.windows(2) {
183        let (_, previous_end) = pair[0].qspan_bounds();
184        let (next_start, _) = pair[1].qspan_bounds();
185
186        if next_start < previous_end {
187            return false;
188        }
189
190        for pos in previous_end..next_start {
191            bridge_positions.insert(pos);
192        }
193    }
194
195    let container_only_boundary_positions = container_only_positions
196        .iter()
197        .filter(|&pos| !bridge_positions.contains(pos))
198        .count();
199
200    if container_only_positions.len() == 1
201        && container_only_boundary_positions == 0
202        && child_only_positions.is_empty()
203    {
204        return false;
205    }
206
207    if child_only_positions.is_empty()
208        && container_only_positions.len() == container_only_boundary_positions
209        && container_only_boundary_positions <= 3
210    {
211        let earliest_child = contained
212            .iter()
213            .map(|m| m.qspan_bounds().0)
214            .min()
215            .unwrap_or(usize::MAX);
216        let latest_child = contained
217            .iter()
218            .map(|m| m.qspan_bounds().1.saturating_sub(1))
219            .max()
220            .unwrap_or(0);
221
222        let is_one_sided_boundary = container_only_positions
223            .iter()
224            .all(|pos| pos < earliest_child)
225            || container_only_positions
226                .iter()
227                .all(|pos| pos > latest_child);
228
229        if is_one_sided_boundary {
230            return false;
231        }
232    }
233
234    let max_container_only_positions =
235        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
236    let max_container_boundary_positions =
237        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
238    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
239
240    container_only_positions.len() <= max_container_only_positions
241        && container_only_boundary_positions <= max_container_boundary_positions
242        && child_only_positions.len() <= max_child_only_positions
243}
244
245fn filter_redundant_same_expression_seq_containers(
246    seq_matches: Vec<LicenseMatch>,
247    candidate_contained_matches: &[LicenseMatch],
248) -> Vec<LicenseMatch> {
249    seq_matches
250        .into_iter()
251        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
252        .collect()
253}
254
255fn is_redundant_low_coverage_composite_seq_wrapper(
256    container: &LicenseMatch,
257    candidate_contained_matches: &[LicenseMatch],
258) -> bool {
259    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
260        return false;
261    }
262
263    let container_qspan_set = container.qspan_set();
264
265    let children: Vec<&LicenseMatch> = candidate_contained_matches
266        .iter()
267        .filter(|m| {
268            m.matcher == aho_match::MATCH_AHO
269                && has_full_match_coverage(m)
270                && m.license_expression != container.license_expression
271                && m.overlaps_with(&container_qspan_set)
272        })
273        .collect();
274
275    if children.len() < 2 {
276        return false;
277    }
278
279    let unique_expressions: HashSet<&str> = children
280        .iter()
281        .map(|m| m.license_expression.as_str())
282        .collect();
283    if unique_expressions.len() < 2 {
284        return false;
285    }
286
287    let mut child_union = PositionSet::new();
288    for m in &children {
289        child_union.extend_from_span(m.query_span());
290    }
291
292    let container_only_positions = container_qspan_set.difference(&child_union);
293    let child_only_positions = child_union.difference(&container_qspan_set);
294
295    let mut sorted_children = children;
296    sorted_children.sort_by_key(|m| m.qspan_bounds());
297
298    let mut bridge_positions = BitSet::new();
299    for pair in sorted_children.windows(2) {
300        let (_, previous_end) = pair[0].qspan_bounds();
301        let (next_start, _) = pair[1].qspan_bounds();
302        for pos in previous_end..next_start {
303            bridge_positions.insert(pos);
304        }
305    }
306
307    let container_only_boundary_positions = container_only_positions
308        .iter()
309        .filter(|&pos| !bridge_positions.contains(pos))
310        .count();
311
312    child_only_positions.is_empty()
313        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
314        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
315}
316
317fn filter_redundant_low_coverage_composite_seq_wrappers(
318    seq_matches: Vec<LicenseMatch>,
319    candidate_contained_matches: &[LicenseMatch],
320) -> Vec<LicenseMatch> {
321    seq_matches
322        .into_iter()
323        .filter(|m| {
324            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
325        })
326        .collect()
327}
328
329fn subtract_spdx_match_qspans(
330    query: &mut Query<'_>,
331    matched_qspans: &mut Vec<models::PositionSpan>,
332    aho_extra_matchables: &mut PositionSet,
333    spdx_matches: &[LicenseMatch],
334) {
335    for m in spdx_matches {
336        let Some(span) = query_span_for_match(m) else {
337            continue;
338        };
339
340        aho_extra_matchables.extend_from_span(&span);
341        query.subtract(&span);
342
343        if has_full_match_coverage(m) {
344            matched_qspans.push(span);
345        }
346    }
347}
348
349fn merge_and_prepare_aho_matches(
350    index: &index::LicenseIndex,
351    query: &mut Query<'_>,
352    matched_qspans: &mut Vec<models::PositionSpan>,
353    refined_aho: &[LicenseMatch],
354) -> (Vec<LicenseMatch>, bool) {
355    let merged_aho = merge_overlapping_matches(refined_aho);
356    let mut saw_long_exact_license_text_match = false;
357
358    for m in &merged_aho {
359        let Some(span) = query_span_for_match(m) else {
360            continue;
361        };
362
363        if has_full_match_coverage(m) {
364            matched_qspans.push(span.clone());
365        }
366
367        if index
368            .rules_by_rid
369            .get(m.rid)
370            .is_some_and(|rule| rule.is_license_text())
371            && m.rule_length > 120
372            && m.coverage() > 98.0
373        {
374            query.subtract(&span);
375            saw_long_exact_license_text_match = true;
376        }
377    }
378
379    (merged_aho, saw_long_exact_license_text_match)
380}
381
382fn collect_whole_query_exact_followup_matches(
383    index: &index::LicenseIndex,
384    query: &mut Query<'_>,
385    matched_qspans: &mut Vec<models::PositionSpan>,
386    whole_run: &query::QueryRun<'_>,
387) -> Vec<LicenseMatch> {
388    let mut seq_all_matches = Vec::new();
389
390    if whole_run.is_matchable(false, matched_qspans) {
391        let near_dupe_candidates =
392            select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
393
394        if !near_dupe_candidates.is_empty() {
395            let near_dupe_matches =
396                seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
397
398            for m in &near_dupe_matches {
399                if !m.query_span().is_empty() {
400                    let span = m.query_span().clone();
401                    query.subtract(&span);
402                    matched_qspans.push(span);
403                }
404            }
405
406            seq_all_matches.extend(near_dupe_matches);
407        }
408    }
409
410    seq_all_matches
411}
412
413fn collect_regular_seq_matches(
414    index: &index::LicenseIndex,
415    query: &Query<'_>,
416    matched_qspans: &[models::PositionSpan],
417    candidate_contained_matches: &[LicenseMatch],
418) -> Vec<LicenseMatch> {
419    let mut seq_all_matches = Vec::new();
420
421    for query_run in query.query_runs() {
422        if !query_run.is_matchable(false, matched_qspans) {
423            continue;
424        }
425
426        let candidates =
427            select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
428        if !candidates.is_empty() {
429            let matches = seq_match_with_candidates(index, &query_run, &candidates);
430            seq_all_matches.extend(matches);
431        }
432    }
433
434    let merged_seq = merge_overlapping_matches(&seq_all_matches);
435    let filtered_same_expression =
436        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
437    filter_redundant_low_coverage_composite_seq_wrappers(
438        filtered_same_expression,
439        candidate_contained_matches,
440    )
441}
442
443impl LicenseDetectionEngine {
444    /// Create a new license detection engine from a pre-built license index.
445    ///
446    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
447    /// It builds the SPDX mapping from the licenses in the index.
448    fn from_index(
449        index: index::LicenseIndex,
450        spdx_license_list_version: Option<String>,
451    ) -> Result<Self> {
452        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
453        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
454        let spdx_mapping = build_spdx_mapping(&license_vec);
455
456        Ok(Self {
457            index: Arc::new(index),
458            spdx_mapping,
459            spdx_license_list_version,
460        })
461    }
462
463    /// Create a new license detection engine from the embedded license index.
464    ///
465    /// Convenience method that uses the default Provenant cache root and does
466    /// not force a reindex.
467    pub fn from_embedded() -> Result<Self> {
468        let cache_config =
469            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
470        Self::from_embedded_with_cache(&cache_config)
471    }
472
473    /// Create a new license detection engine from the embedded license index.
474    ///
475    /// This method loads the build-time embedded license artifact and constructs
476    /// the runtime license index. This eliminates the runtime dependency on the
477    /// ScanCode rules directory.
478    ///
479    /// If a valid cache exists (matching fingerprint), the index is loaded from
480    /// the rkyv cache file instead of being rebuilt from scratch.
481    ///
482    /// # Arguments
483    /// * `cache_config` - Cache configuration (directory and reindex flag)
484    ///
485    /// # Returns
486    /// A Result containing the engine or an error
487    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
488        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
489        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
490
491        if !cache_config.reindex {
492            if let Some(cached) =
493                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
494            {
495                let start = Instant::now();
496                let spdx_version = cached.spdx_license_list_version.clone();
497                eprintln!(
498                    "License index loaded from rkyv cache in {:.2}s",
499                    start.elapsed().as_secs_f64()
500                );
501                return Self::from_index(cached, spdx_version);
502            }
503        } else {
504            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
505        }
506
507        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
508            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
509        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
510
511        let start = Instant::now();
512        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
513        eprintln!(
514            "License index built from embedded artifact in {:.2}s",
515            start.elapsed().as_secs_f64()
516        );
517
518        let mut index = index;
519        index.spdx_license_list_version = spdx_version.clone();
520        if let Err(e) = save_cached_index(
521            cache_config,
522            LicenseCacheNamespace::Embedded,
523            &index,
524            &fingerprint,
525        ) {
526            eprintln!("Warning: failed to save license index cache: {}", e);
527        } else if let Some(size) =
528            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
529        {
530            eprintln!(
531                "License index cache saved ({:.1} MB)",
532                size as f64 / 1_048_576.0
533            );
534        }
535
536        Self::from_index(index, spdx_version)
537    }
538
539    /// Create a new license detection engine from a directory of license rules.
540    ///
541    /// Convenience method that uses the default Provenant cache root and does
542    /// not force a reindex.
543    pub fn from_directory(rules_path: &Path) -> Result<Self> {
544        let cache_config =
545            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
546        Self::from_directory_with_cache(rules_path, &cache_config)
547    }
548
549    /// Create a new license detection engine from a directory of license rules.
550    ///
551    /// If a valid cache exists (matching fingerprint of the rules), the index is
552    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
553    ///
554    /// # Arguments
555    /// * `rules_path` - Path to directory containing .LICENSE and .RULE files
556    /// * `cache_config` - Cache configuration (directory and reindex flag)
557    ///
558    /// # Returns
559    /// A Result containing the engine or an error
560    pub fn from_directory_with_cache(
561        rules_path: &Path,
562        cache_config: &LicenseCacheConfig,
563    ) -> Result<Self> {
564        let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
565            (rules_path.join("rules"), rules_path.join("licenses"))
566        } else if rules_path.ends_with("rules") {
567            let parent = rules_path.parent().ok_or_else(|| {
568                anyhow::anyhow!("Cannot determine parent directory for rules path")
569            })?;
570            (rules_path.to_path_buf(), parent.join("licenses"))
571        } else {
572            (rules_path.to_path_buf(), rules_path.to_path_buf())
573        };
574
575        let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
576        let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
577
578        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses);
579
580        if !cache_config.reindex {
581            if let Some(cached) = load_cached_index(
582                cache_config,
583                LicenseCacheNamespace::CustomRules,
584                &fingerprint,
585            )? {
586                let start = Instant::now();
587                eprintln!(
588                    "License index loaded from rkyv cache in {:.2}s",
589                    start.elapsed().as_secs_f64()
590                );
591                let spdx_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
592                return Self::from_index(cached, spdx_version);
593            }
594        } else {
595            delete_cache(
596                cache_config,
597                LicenseCacheNamespace::CustomRules,
598                &fingerprint,
599            )?;
600        }
601
602        let start = Instant::now();
603        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
604        eprintln!(
605            "License index built from rules directory in {:.2}s",
606            start.elapsed().as_secs_f64()
607        );
608
609        let spdx_license_list_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
610
611        if let Err(e) = save_cached_index(
612            cache_config,
613            LicenseCacheNamespace::CustomRules,
614            &index,
615            &fingerprint,
616        ) {
617            eprintln!("Warning: failed to save license index cache: {}", e);
618        } else if let Some(size) = cache_file_size(
619            cache_config,
620            LicenseCacheNamespace::CustomRules,
621            &fingerprint,
622        ) {
623            eprintln!(
624                "License index cache saved ({:.1} MB)",
625                size as f64 / 1_048_576.0
626            );
627        }
628
629        Self::from_index(index, spdx_license_list_version)
630    }
631
632    pub fn embedded_spdx_license_list_version() -> Result<String> {
633        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
634        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
635            .map_err(|e| {
636                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
637            })?
638            .spdx_license_list_version)
639    }
640
641    pub fn detect_with_kind(
642        &self,
643        text: &str,
644        unknown_licenses: bool,
645        binary_derived: bool,
646    ) -> Result<Vec<LicenseDetection>> {
647        self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
648    }
649
650    pub fn detect_with_kind_with_score(
651        &self,
652        text: &str,
653        unknown_licenses: bool,
654        binary_derived: bool,
655        min_score: f32,
656    ) -> Result<Vec<LicenseDetection>> {
657        let clean_text = strip_utf8_bom_str(text);
658
659        let content = truncate_detection_text(clean_text);
660
661        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
662        let whole_query_run = query.whole_query_run();
663
664        let mut all_matches = Vec::new();
665        let mut candidate_contained_matches = Vec::new();
666        let mut aho_extra_matchables = PositionSet::new();
667        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
668
669        // Phase 1a: Hash matching
670        // Python returns immediately if hash matches found (index.py:987-991)
671        {
672            let hash_matches = hash_match(&self.index, &whole_query_run);
673
674            if !hash_matches.is_empty() {
675                let mut matches = hash_matches;
676                sort_matches_by_line(&mut matches);
677
678                let groups = group_matches_by_region(&matches);
679                let detections: Vec<LicenseDetection> = groups
680                    .iter()
681                    .map(|group| {
682                        let mut detection = empty_detection();
683                        populate_detection_from_group_with_spdx(
684                            &mut detection,
685                            group,
686                            &self.spdx_mapping,
687                            Some(content),
688                        );
689                        detection
690                    })
691                    .collect();
692
693                return Ok(post_process_detections(detections, min_score));
694            }
695        }
696
697        // Phase 1b: SPDX-LID matching
698        {
699            let spdx_matches = spdx_lid_match(&self.index, &query);
700            subtract_spdx_match_qspans(
701                &mut query,
702                &mut matched_qspans,
703                &mut aho_extra_matchables,
704                &spdx_matches,
705            );
706            all_matches.extend(spdx_matches);
707        }
708
709        // Phase 1c: Aho-Corasick matching
710        {
711            let aho_matches = if aho_extra_matchables.is_empty() {
712                aho_match(&self.index, &whole_query_run)
713            } else {
714                aho_match::aho_match_with_extra_matchables(
715                    &self.index,
716                    &whole_query_run,
717                    Some(&aho_extra_matchables),
718                )
719            };
720
721            // Python's get_exact_matches() calls refine_matches with merge=False
722            // This applies quality filters including required phrase filtering
723            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
724            candidate_contained_matches.extend(refined_aho.clone());
725            let (merged_aho, _) = merge_and_prepare_aho_matches(
726                &self.index,
727                &mut query,
728                &mut matched_qspans,
729                &refined_aho,
730            );
731            all_matches.extend(merged_aho);
732
733            let whole_query_followup = collect_whole_query_exact_followup_matches(
734                &self.index,
735                &mut query,
736                &mut matched_qspans,
737                &whole_query_run,
738            );
739            all_matches.extend(whole_query_followup);
740
741            let merged_seq = collect_regular_seq_matches(
742                &self.index,
743                &query,
744                &matched_qspans,
745                &candidate_contained_matches,
746            );
747            all_matches.extend(merged_seq);
748        }
749
750        // Step 1: Initial refine WITHOUT false positive filtering
751        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
752        let merged_matches =
753            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
754
755        // Step 2: Unknown detection and weak match handling
756        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
757        let refined_matches = if unknown_licenses {
758            // Split weak from good - Python: index.py:1083
759            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
760
761            // Unknown detection on uncovered regions - Python: index.py:1093-1114
762            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
763            let filtered_unknown =
764                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
765
766            let mut all_matches = good_matches;
767            all_matches.extend(filtered_unknown);
768            // reinject weak matches and let refine matches keep the bests
769            // Python: index.py:1117-1118
770            all_matches.extend(weak_matches);
771            all_matches
772        } else {
773            merged_matches
774        };
775
776        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
777        let refined = refine_matches(&self.index, refined_matches, &query);
778
779        let mut sorted = refined;
780        sort_matches_by_line(&mut sorted);
781
782        let groups = group_matches_by_region(&sorted);
783
784        let detections: Vec<LicenseDetection> = groups
785            .iter()
786            .map(|group| {
787                let mut detection = empty_detection();
788                populate_detection_from_group_with_spdx(
789                    &mut detection,
790                    group,
791                    &self.spdx_mapping,
792                    Some(content),
793                );
794                detection
795            })
796            .collect();
797
798        let detections = post_process_detections(detections, min_score);
799
800        Ok(detections)
801    }
802
803    pub fn detect_with_kind_and_source(
804        &self,
805        text: &str,
806        unknown_licenses: bool,
807        binary_derived: bool,
808        source_path: &str,
809    ) -> Result<Vec<LicenseDetection>> {
810        let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
811        attach_source_path_to_detections(&mut detections, source_path);
812        Ok(detections)
813    }
814
815    pub fn detect_with_kind_and_source_with_score(
816        &self,
817        text: &str,
818        unknown_licenses: bool,
819        binary_derived: bool,
820        source_path: &str,
821        min_score: f32,
822    ) -> Result<Vec<LicenseDetection>> {
823        let mut detections =
824            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
825        attach_source_path_to_detections(&mut detections, source_path);
826        Ok(detections)
827    }
828
829    /// Detect licenses and return raw matches (like Python's idx.match()).
830    ///
831    /// This is primarily used by golden tests and maintenance tooling that need
832    /// raw match sequences before grouping or post-processing into detections.
       ///
       /// Only compiled for test builds or when the `golden-tests` feature is enabled.
       ///
       /// # Arguments
       ///
       /// * `text` - Text to scan for license matches.
       /// * `unknown_licenses` - When `true`, the unknown-license matching step runs
       ///   and its filtered results are added before the final refinement.
       /// * `binary_derived` - Forwarded unchanged to `Query::from_extracted_text`.
       ///
       /// # Returns
       ///
       /// The ungrouped matches after `sort_matches_by_line` has been applied. If hash
       /// matching yields any match, only those hash matches are returned and the
       /// later phases are skipped.
       ///
       /// # Errors
       ///
       /// Propagates the error from `Query::from_extracted_text` when the query
       /// cannot be built. No other call in this function uses `?`.
833    #[cfg(any(test, feature = "golden-tests"))]
834    pub fn detect_matches_with_kind(
835        &self,
836        text: &str,
837        unknown_licenses: bool,
838        binary_derived: bool,
839    ) -> Result<Vec<LicenseMatch>> {
           // Preprocess the input with the BOM-stripping and truncation helpers
           // before it is turned into a query.
840        let clean_text = strip_utf8_bom_str(text);
841
842        let content = truncate_detection_text(clean_text);
843
           // The only fallible step: a query-construction error goes back to the caller.
844        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
845        let whole_query_run = query.whole_query_run();
846
           // Accumulator for the matches produced by phases 1b and 1c.
847        let mut all_matches = Vec::new();
           // Copies of the refined Aho-Corasick matches; handed to
           // `collect_regular_seq_matches` further down.
848        let mut candidate_contained_matches = Vec::new();
           // Passed mutably to `subtract_spdx_match_qspans` (presumably populated
           // there); when non-empty it selects the "extra matchables" Aho variant.
849        let mut aho_extra_matchables = PositionSet::new();
           // Shared mutable state threaded through the SPDX, Aho and follow-up steps.
850        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
851
852        // Phase 1a: Hash matching
           // Any hash match short-circuits the pipeline: the matches are sorted and
           // returned immediately, so none of the later phases or refinements run.
853        {
854            let hash_matches = hash_match(&self.index, &whole_query_run);
855
856            if !hash_matches.is_empty() {
857                let mut matches = hash_matches;
858                sort_matches_by_line(&mut matches);
859                return Ok(matches);
860            }
861        }
862
863        // Phase 1b: SPDX-LID matching
           // The SPDX matches are first used to update `query`, `matched_qspans` and
           // `aho_extra_matchables`, and only then moved into `all_matches`.
864        {
865            let spdx_matches = spdx_lid_match(&self.index, &query);
866            subtract_spdx_match_qspans(
867                &mut query,
868                &mut matched_qspans,
869                &mut aho_extra_matchables,
870                &spdx_matches,
871            );
872            all_matches.extend(spdx_matches);
873        }
874
875        // Phase 1c: Aho-Corasick matching
           // This block also runs the whole-query exact follow-up and the regular
           // sequence matching, both of which depend on state updated by the Aho step.
876        {
               // Use the plain matcher unless phase 1b left extra matchable positions.
877            let aho_matches = if aho_extra_matchables.is_empty() {
878                aho_match(&self.index, &whole_query_run)
879            } else {
880                aho_match::aho_match_with_extra_matchables(
881                    &self.index,
882                    &whole_query_run,
883                    Some(&aho_extra_matchables),
884                )
885            };
886            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
               // Keep a copy of the refined matches for the sequence-matching call below.
887            candidate_contained_matches.extend(refined_aho.clone());
               // Only the first element of the returned pair is used here.
888            let (merged_aho, _) = merge_and_prepare_aho_matches(
889                &self.index,
890                &mut query,
891                &mut matched_qspans,
892                &refined_aho,
893            );
894            all_matches.extend(merged_aho);
895
               // Follow-up pass over the whole query run; may further update `query`
               // and `matched_qspans`, which are passed mutably.
896            let whole_query_followup = collect_whole_query_exact_followup_matches(
897                &self.index,
898                &mut query,
899                &mut matched_qspans,
900                &whole_query_run,
901            );
902            all_matches.extend(whole_query_followup);
903
               // Sequence matching only reads the accumulated state (shared borrows).
904            let merged_seq = collect_regular_seq_matches(
905                &self.index,
906                &query,
907                &matched_qspans,
908                &candidate_contained_matches,
909            );
910            all_matches.extend(merged_seq);
911        }
912
913        // Step 1: Initial refine WITHOUT false positive filtering
914        let merged_matches =
915            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
916
917        // Step 2: Unknown detection and weak match handling
           // When enabled, the result is ordered as: good matches, then the filtered
           // unknown matches, then the weak matches. Otherwise step 1's output is
           // passed through untouched.
918        let refined_matches = if unknown_licenses {
919            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
               // Unknown matching is computed against the good matches only.
920            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
921            let filtered_unknown =
922                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
923
               // This `all_matches` is a new local scoped to the `if` branch; the outer
               // accumulator was already moved into the step 1 refine call.
924            let mut all_matches = good_matches;
925            all_matches.extend(filtered_unknown);
926            all_matches.extend(weak_matches);
927            all_matches
928        } else {
929            merged_matches
930        };
931
932        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
933        let refined = refine_matches(&self.index, refined_matches, &query);
934
935        let mut sorted = refined;
936        sort_matches_by_line(&mut sorted);
937
938        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
939        Ok(sorted)
940    }
941
942    /// Borrow the license index held by this engine.
       ///
       /// The returned reference lives as long as the borrow of `self`.
943    pub fn index(&self) -> &index::LicenseIndex {
944        &self.index
945    }
946
/// Return the SPDX license list version stored on this engine as a borrowed
/// string slice, or `None` when the `spdx_license_list_version` field is unset.
947    pub fn spdx_license_list_version(&self) -> Option<&str> {
948        self.spdx_license_list_version.as_deref()
949    }
950
951    /// Borrow the SPDX mapping held by this engine.
       ///
       /// Only compiled for test builds (`#[cfg(test)]`).
952    #[cfg(test)]
953    pub fn spdx_mapping(&self) -> &SpdxMapping {
954        &self.spdx_mapping
955    }
956}
957
958pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
959    for ancestor in search_path.ancestors() {
960        let candidate = ancestor.join("scancode_config.py");
961        if candidate.is_file() {
962            let config = fs::read_to_string(&candidate)?;
963            return Ok(parse_scancode_spdx_license_list_version(&config));
964        }
965    }
966
967    Ok(None)
968}
969
/// Extract the `spdx_license_list_version` value from the text of a
/// `scancode_config.py` file.
///
/// Scans `config` line by line and uses the first line that, after trimming,
/// starts with the identifier `spdx_license_list_version` and contains an `=`.
/// The text after the first `=` is trimmed of whitespace and of enclosing double
/// and single quotes, then returned.
///
/// The key must end at an identifier boundary: a longer name that merely shares
/// the prefix (for example `spdx_license_list_version_url = "..."`) is skipped
/// instead of being mistaken for the version assignment.
///
/// Returns `None` when no line qualifies.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let after_key = line.trim().strip_prefix(KEY)?;

        // An identifier character right after the key means this line assigns to
        // a different, longer name.
        if after_key.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        // KEY contains no '=', so this is still the first '=' of the line.
        let (_, value) = after_key.split_once('=')?;
        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
983
984#[cfg(test)]
985mod tests;