Skip to main content

provenant/license_detection/
mod.rs

1//! License Detection Engine
2
3pub mod aho_match;
4pub mod automaton;
5pub mod build_policy;
6pub mod dataset;
7pub(crate) mod detection;
8pub mod embedded;
9pub mod license_cache;
10mod position_set;
11mod token_multiset;
12mod token_set;
13
14#[cfg(test)]
15mod embedded_test;
16pub mod expression;
17#[cfg(all(test, feature = "golden-tests"))]
18mod golden_test;
19#[cfg(feature = "golden-tests")]
20pub mod golden_utils;
21pub mod hash_match;
22pub mod index;
23mod match_refine;
24pub mod models;
25pub mod query;
26pub mod rules;
27pub mod seq_match;
28pub mod spdx_lid;
29pub mod spdx_mapping;
30#[cfg(test)]
31mod test_utils;
32pub mod tokenize;
33pub mod unknown_match;
34
35use bit_set::BitSet;
36use std::collections::HashSet;
37use std::fs;
38use std::path::Path;
39use std::sync::Arc;
40use std::time::Instant;
41
42use anyhow::Result;
43
44use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
45use crate::license_detection::dataset::{
46    CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
47    load_license_dataset_from_root,
48};
49use crate::license_detection::embedded::index::{
50    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
51};
52use crate::license_detection::index::build_index_from_loaded;
53use crate::license_detection::license_cache::{
54    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
55    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
56};
57use crate::license_detection::query::Query;
58use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
59use crate::models::LicenseIndexProvenance;
60use crate::utils::text::strip_utf8_bom_str;
61
62use crate::license_detection::detection::{
63    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
64};
65use crate::license_detection::models::MatcherKind;
66
/// Location of the `rules` directory inside the reference scancode-toolkit submodule.
///
/// Consumed by test code and by the xtask `generate-license-loader-artifact` binary,
/// which is why `dead_code` is allowed here.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Location of the `licenses` directory inside the reference scancode-toolkit submodule.
///
/// Consumed by test code and by the xtask `generate-license-loader-artifact` binary,
/// which is why `dead_code` is allowed here.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Location of the license `data` directory (parent of `rules` and `licenses`) inside
/// the reference scancode-toolkit submodule.
///
/// Consumed by test code and by the xtask `generate-license-loader-artifact` binary,
/// which is why `dead_code` is allowed here.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// Default LicenseDB URL template containing a single `{}` placeholder.
///
/// NOTE(review): the placeholder is presumably filled with a license key or file
/// name by callers outside this module — confirm at the use sites.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";

/// Error message carried by the error that `ensure_within_deadline` returns once the
/// detection deadline has passed.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
86
87pub(crate) use detection::{
88    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
89};
90pub use models::LicenseMatch;
91
92pub use aho_match::aho_match;
93pub use hash_match::hash_match;
94pub use match_refine::{
95    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
96    refine_matches_without_false_positive_filter, split_weak_matches,
97};
98pub use position_set::PositionSet;
99pub use spdx_lid::spdx_lid_match;
100pub use token_multiset::TokenMultiset;
101pub use token_set::TokenSet;
102pub use unknown_match::unknown_match;
103
104use self::seq_match::{
105    MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
106    seq_match_with_candidates_and_deadline,
107};
108
109/// License detection engine that orchestrates the detection pipeline.
110///
111/// The engine loads license rules and builds an index for efficient matching.
112/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
113/// and combines their results into final license detections.
114#[derive(Debug, Clone)]
115pub struct LicenseDetectionEngine {
116    index: Arc<index::LicenseIndex>,
117    spdx_mapping: SpdxMapping,
118    spdx_license_list_version: Option<String>,
119    license_index_provenance: Option<LicenseIndexProvenance>,
120}
121
/// Largest text size, in bytes, that is fed to detection (10 MiB); longer text is
/// truncated first.
const MAX_DETECTION_SIZE: usize = 10 << 20;
/// Candidate cap handed to sequence-candidate selection for regular query runs.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Per-child allowance of container-only positions when judging whether a sequence
/// container is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance of child-only positions when judging whether a sequence container
/// is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
126
/// Reports whether `deadline` has been reached.
///
/// `None` means there is no deadline, so it is never exceeded. A deadline equal to
/// the current instant counts as exceeded.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
130
131pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
132    if deadline_exceeded(deadline) {
133        Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
134    } else {
135        Ok(())
136    }
137}
138
139fn truncate_detection_text(clean_text: &str) -> &str {
140    if clean_text.len() <= MAX_DETECTION_SIZE {
141        return clean_text;
142    }
143
144    log::debug!(
145        "Content size {} exceeds limit {}, truncating for detection",
146        clean_text.len(),
147        MAX_DETECTION_SIZE
148    );
149
150    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
151    &clean_text[..boundary]
152}
153
154fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
155    (!m.query_span().is_empty()).then(|| m.query_span().clone())
156}
157
158fn has_full_match_coverage(m: &LicenseMatch) -> bool {
159    m.coverage() == 100.0
160}
161
162fn is_redundant_same_expression_seq_container(
163    container: &LicenseMatch,
164    candidate_contained_matches: &[LicenseMatch],
165) -> bool {
166    let container_is_redundant_coverage =
167        has_full_match_coverage(container) || container.coverage() >= 99.0;
168    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
169        return false;
170    }
171
172    let container_qspan_set = container.qspan_set();
173
174    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
175        .iter()
176        .filter(|m| {
177            m.matcher == MatcherKind::Aho
178                && has_full_match_coverage(m)
179                && m.license_expression == container.license_expression
180                && m.overlaps_with(&container_qspan_set)
181        })
182        .collect();
183
184    if contained.len() < 2 {
185        return false;
186    }
187
188    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
189    if material_children < 2 {
190        return false;
191    }
192
193    contained.sort_by_key(|m| m.qspan_bounds());
194
195    let mut child_union = PositionSet::new();
196    for m in &contained {
197        child_union.extend_from_span(m.query_span());
198    }
199
200    let container_only_positions = container_qspan_set.difference(&child_union);
201    let child_only_positions = child_union.difference(&container_qspan_set);
202
203    let mut bridge_positions = BitSet::new();
204    for pair in contained.windows(2) {
205        let (_, previous_end) = pair[0].qspan_bounds();
206        let (next_start, _) = pair[1].qspan_bounds();
207
208        if next_start < previous_end {
209            return false;
210        }
211
212        for pos in previous_end..next_start {
213            bridge_positions.insert(pos);
214        }
215    }
216
217    let container_only_boundary_positions = container_only_positions
218        .iter()
219        .filter(|&pos| !bridge_positions.contains(pos))
220        .count();
221
222    if container_only_positions.len() == 1
223        && container_only_boundary_positions == 0
224        && child_only_positions.is_empty()
225    {
226        return false;
227    }
228
229    if child_only_positions.is_empty()
230        && container_only_positions.len() == container_only_boundary_positions
231        && container_only_boundary_positions <= 3
232    {
233        let earliest_child = contained
234            .iter()
235            .map(|m| m.qspan_bounds().0)
236            .min()
237            .unwrap_or(usize::MAX);
238        let latest_child = contained
239            .iter()
240            .map(|m| m.qspan_bounds().1.saturating_sub(1))
241            .max()
242            .unwrap_or(0);
243
244        let is_one_sided_boundary = container_only_positions
245            .iter()
246            .all(|pos| pos < earliest_child)
247            || container_only_positions
248                .iter()
249                .all(|pos| pos > latest_child);
250
251        if is_one_sided_boundary {
252            return false;
253        }
254    }
255
256    let max_container_only_positions =
257        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
258    let max_container_boundary_positions =
259        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
260    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
261
262    container_only_positions.len() <= max_container_only_positions
263        && container_only_boundary_positions <= max_container_boundary_positions
264        && child_only_positions.len() <= max_child_only_positions
265}
266
267fn filter_redundant_same_expression_seq_containers(
268    seq_matches: Vec<LicenseMatch>,
269    candidate_contained_matches: &[LicenseMatch],
270) -> Vec<LicenseMatch> {
271    seq_matches
272        .into_iter()
273        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
274        .collect()
275}
276
277fn is_redundant_low_coverage_composite_seq_wrapper(
278    container: &LicenseMatch,
279    candidate_contained_matches: &[LicenseMatch],
280) -> bool {
281    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
282        return false;
283    }
284
285    let container_qspan_set = container.qspan_set();
286
287    let children: Vec<&LicenseMatch> = candidate_contained_matches
288        .iter()
289        .filter(|m| {
290            m.matcher == aho_match::MATCH_AHO
291                && has_full_match_coverage(m)
292                && m.license_expression != container.license_expression
293                && m.overlaps_with(&container_qspan_set)
294        })
295        .collect();
296
297    if children.len() < 2 {
298        return false;
299    }
300
301    let unique_expressions: HashSet<&str> = children
302        .iter()
303        .map(|m| m.license_expression.as_str())
304        .collect();
305    if unique_expressions.len() < 2 {
306        return false;
307    }
308
309    let mut child_union = PositionSet::new();
310    for m in &children {
311        child_union.extend_from_span(m.query_span());
312    }
313
314    let container_only_positions = container_qspan_set.difference(&child_union);
315    let child_only_positions = child_union.difference(&container_qspan_set);
316
317    let mut sorted_children = children;
318    sorted_children.sort_by_key(|m| m.qspan_bounds());
319
320    let mut bridge_positions = BitSet::new();
321    for pair in sorted_children.windows(2) {
322        let (_, previous_end) = pair[0].qspan_bounds();
323        let (next_start, _) = pair[1].qspan_bounds();
324        for pos in previous_end..next_start {
325            bridge_positions.insert(pos);
326        }
327    }
328
329    let container_only_boundary_positions = container_only_positions
330        .iter()
331        .filter(|&pos| !bridge_positions.contains(pos))
332        .count();
333
334    child_only_positions.is_empty()
335        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
336        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
337}
338
339fn filter_redundant_low_coverage_composite_seq_wrappers(
340    seq_matches: Vec<LicenseMatch>,
341    candidate_contained_matches: &[LicenseMatch],
342) -> Vec<LicenseMatch> {
343    seq_matches
344        .into_iter()
345        .filter(|m| {
346            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
347        })
348        .collect()
349}
350
351fn subtract_spdx_match_qspans(
352    query: &mut Query<'_>,
353    matched_qspans: &mut Vec<models::PositionSpan>,
354    aho_extra_matchables: &mut PositionSet,
355    spdx_matches: &[LicenseMatch],
356) {
357    for m in spdx_matches {
358        let Some(span) = query_span_for_match(m) else {
359            continue;
360        };
361
362        aho_extra_matchables.extend_from_span(&span);
363        query.subtract(&span);
364
365        if has_full_match_coverage(m) {
366            matched_qspans.push(span);
367        }
368    }
369}
370
371fn merge_and_prepare_aho_matches(
372    index: &index::LicenseIndex,
373    query: &mut Query<'_>,
374    matched_qspans: &mut Vec<models::PositionSpan>,
375    refined_aho: &[LicenseMatch],
376) -> (Vec<LicenseMatch>, bool) {
377    let merged_aho = merge_overlapping_matches(refined_aho);
378    let mut saw_long_exact_license_text_match = false;
379
380    for m in &merged_aho {
381        let Some(span) = query_span_for_match(m) else {
382            continue;
383        };
384
385        if has_full_match_coverage(m) {
386            matched_qspans.push(span.clone());
387        }
388
389        if index
390            .rules_by_rid
391            .get(m.rid)
392            .is_some_and(|rule| rule.is_license_text())
393            && m.rule_length > 120
394            && m.coverage() > 98.0
395        {
396            query.subtract(&span);
397            saw_long_exact_license_text_match = true;
398        }
399    }
400
401    (merged_aho, saw_long_exact_license_text_match)
402}
403
404fn collect_whole_query_exact_followup_matches(
405    index: &index::LicenseIndex,
406    query: &mut Query<'_>,
407    matched_qspans: &mut Vec<models::PositionSpan>,
408    whole_run: &query::QueryRun<'_>,
409    deadline: Option<Instant>,
410) -> Result<Vec<LicenseMatch>> {
411    let mut seq_all_matches = Vec::new();
412
413    if whole_run.is_matchable(false, matched_qspans) {
414        let near_dupe_candidates = if deadline.is_some() {
415            select_seq_candidates_with_deadline(
416                index,
417                whole_run,
418                true,
419                MAX_NEAR_DUPE_CANDIDATES,
420                deadline,
421            )?
422        } else {
423            self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
424        };
425
426        if !near_dupe_candidates.is_empty() {
427            let near_dupe_matches = if deadline.is_some() {
428                seq_match_with_candidates_and_deadline(
429                    index,
430                    whole_run,
431                    &near_dupe_candidates,
432                    deadline,
433                )?
434            } else {
435                self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
436            };
437
438            for m in &near_dupe_matches {
439                if !m.query_span().is_empty() {
440                    let span = m.query_span().clone();
441                    query.subtract(&span);
442                    matched_qspans.push(span);
443                }
444            }
445
446            seq_all_matches.extend(near_dupe_matches);
447        }
448    }
449
450    Ok(seq_all_matches)
451}
452
453fn collect_regular_seq_matches(
454    index: &index::LicenseIndex,
455    query: &Query<'_>,
456    matched_qspans: &[models::PositionSpan],
457    candidate_contained_matches: &[LicenseMatch],
458    deadline: Option<Instant>,
459) -> Result<Vec<LicenseMatch>> {
460    let mut seq_all_matches = Vec::new();
461
462    for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
463        if query_run_index % 8 == 0 {
464            ensure_within_deadline(deadline)?;
465        }
466
467        if !query_run.is_matchable(false, matched_qspans) {
468            continue;
469        }
470
471        let candidates = if deadline.is_some() {
472            select_seq_candidates_with_deadline(
473                index,
474                &query_run,
475                false,
476                MAX_REGULAR_SEQ_CANDIDATES,
477                deadline,
478            )?
479        } else {
480            self::seq_match::select_seq_candidates(
481                index,
482                &query_run,
483                false,
484                MAX_REGULAR_SEQ_CANDIDATES,
485            )
486        };
487        if !candidates.is_empty() {
488            let matches = if deadline.is_some() {
489                seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
490            } else {
491                self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
492            };
493            seq_all_matches.extend(matches);
494        }
495    }
496
497    let merged_seq = merge_overlapping_matches(&seq_all_matches);
498    let filtered_same_expression =
499        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
500    Ok(filter_redundant_low_coverage_composite_seq_wrappers(
501        filtered_same_expression,
502        candidate_contained_matches,
503    ))
504}
505
506impl LicenseDetectionEngine {
507    /// Create a new license detection engine from a pre-built license index.
508    ///
509    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
510    /// It builds the SPDX mapping from the licenses in the index.
511    fn from_index(
512        index: index::LicenseIndex,
513        spdx_license_list_version: Option<String>,
514        license_index_provenance: Option<LicenseIndexProvenance>,
515    ) -> Result<Self> {
516        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
517        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
518        let spdx_mapping = build_spdx_mapping(&license_vec);
519
520        Ok(Self {
521            index: Arc::new(index),
522            spdx_mapping,
523            spdx_license_list_version,
524            license_index_provenance,
525        })
526    }
527
528    #[cfg(test)]
529    pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
530        Self::from_index(index, None, None).expect("test index should build license engine")
531    }
532
533    /// Create a new license detection engine from the embedded license index.
534    ///
535    /// Convenience method that uses the default Provenant cache root and does
536    /// not force a reindex.
537    pub fn from_embedded() -> Result<Self> {
538        let cache_config =
539            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
540        Self::from_embedded_with_cache(&cache_config)
541    }
542
543    /// Create a new license detection engine from the embedded license index.
544    ///
545    /// This method loads the build-time embedded license artifact and constructs
546    /// the runtime license index. This eliminates the runtime dependency on the
547    /// ScanCode rules directory.
548    ///
549    /// If a valid cache exists (matching fingerprint), the index is loaded from
550    /// the rkyv cache file instead of being rebuilt from scratch.
551    ///
552    /// # Arguments
553    /// * `cache_config` - Cache configuration (directory and reindex flag)
554    ///
555    /// # Returns
556    /// A Result containing the engine or an error
557    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
558        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
559        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
560        let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
561            .map_err(|e| {
562                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
563            })?;
564        debug_assert_eq!(
565            artifact_metadata.license_index_provenance.source,
566            EMBEDDED_LICENSE_INDEX_SOURCE
567        );
568        let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
569        let provenance = Some(artifact_metadata.license_index_provenance.clone());
570
571        if !cache_config.reindex {
572            if let Some(cached) =
573                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
574            {
575                let start = Instant::now();
576                eprintln!(
577                    "License index loaded from rkyv cache in {:.2}s",
578                    start.elapsed().as_secs_f64()
579                );
580                return Self::from_index(cached, spdx_version, provenance);
581            }
582        } else {
583            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
584        }
585
586        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
587            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
588        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
589        let provenance = Some(snapshot.metadata.license_index_provenance.clone());
590
591        let start = Instant::now();
592        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
593        eprintln!(
594            "License index built from embedded artifact in {:.2}s",
595            start.elapsed().as_secs_f64()
596        );
597
598        let mut index = index;
599        index.spdx_license_list_version = spdx_version.clone();
600        if let Err(e) = save_cached_index(
601            cache_config,
602            LicenseCacheNamespace::Embedded,
603            &index,
604            &fingerprint,
605        ) {
606            eprintln!("Warning: failed to save license index cache: {}", e);
607        } else if let Some(size) =
608            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
609        {
610            eprintln!(
611                "License index cache saved ({:.1} MB)",
612                size as f64 / 1_048_576.0
613            );
614        }
615
616        Self::from_index(index, spdx_version, provenance)
617    }
618
619    /// Create a new license detection engine from a license dataset root.
620    ///
621    /// Convenience method that uses the default Provenant cache root and does
622    /// not force a reindex.
623    pub fn from_directory(rules_path: &Path) -> Result<Self> {
624        let cache_config =
625            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
626        Self::from_directory_with_cache(rules_path, &cache_config)
627    }
628
629    /// Create a new license detection engine from a directory of license rules.
630    ///
631    /// If a valid cache exists (matching fingerprint of the dataset), the index is
632    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
633    ///
634    /// # Arguments
635    /// * `rules_path` - Path to dataset root containing rules/ and licenses/
636    /// * `cache_config` - Cache configuration (directory and reindex flag)
637    ///
638    /// # Returns
639    /// A Result containing the engine or an error
640    pub fn from_directory_with_cache(
641        rules_path: &Path,
642        cache_config: &LicenseCacheConfig,
643    ) -> Result<Self> {
644        let LoadedLicenseDataset {
645            manifest,
646            rules: loaded_rules,
647            licenses: loaded_licenses,
648        } = load_license_dataset_from_root(rules_path)?;
649
650        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
651        let provenance = Some(LicenseIndexProvenance {
652            source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
653            dataset_fingerprint: compute_dataset_fingerprint_string(
654                &loaded_rules,
655                &loaded_licenses,
656            )?,
657            ignored_rules: vec![],
658            ignored_licenses: vec![],
659            ignored_rules_due_to_licenses: vec![],
660            added_rules: vec![],
661            replaced_rules: vec![],
662            added_licenses: vec![],
663            replaced_licenses: vec![],
664        });
665
666        if !cache_config.reindex {
667            if let Some(cached) = load_cached_index(
668                cache_config,
669                LicenseCacheNamespace::CustomRules,
670                &fingerprint,
671            )? {
672                let start = Instant::now();
673                eprintln!(
674                    "License index loaded from rkyv cache in {:.2}s",
675                    start.elapsed().as_secs_f64()
676                );
677                return Self::from_index(
678                    cached,
679                    Some(manifest.spdx_license_list_version),
680                    provenance,
681                );
682            }
683        } else {
684            delete_cache(
685                cache_config,
686                LicenseCacheNamespace::CustomRules,
687                &fingerprint,
688            )?;
689        }
690
691        let start = Instant::now();
692        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
693        eprintln!(
694            "License index built from custom dataset in {:.2}s",
695            start.elapsed().as_secs_f64()
696        );
697
698        if let Err(e) = save_cached_index(
699            cache_config,
700            LicenseCacheNamespace::CustomRules,
701            &index,
702            &fingerprint,
703        ) {
704            eprintln!("Warning: failed to save license index cache: {}", e);
705        } else if let Some(size) = cache_file_size(
706            cache_config,
707            LicenseCacheNamespace::CustomRules,
708            &fingerprint,
709        ) {
710            eprintln!(
711                "License index cache saved ({:.1} MB)",
712                size as f64 / 1_048_576.0
713            );
714        }
715
716        Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
717    }
718
719    pub fn embedded_spdx_license_list_version() -> Result<String> {
720        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
721        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
722            .map_err(|e| {
723                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
724            })?
725            .spdx_license_list_version)
726    }
727
728    pub fn detect_with_kind(
729        &self,
730        text: &str,
731        unknown_licenses: bool,
732        binary_derived: bool,
733    ) -> Result<Vec<LicenseDetection>> {
734        self.detect_with_kind_with_score_and_deadline(
735            text,
736            unknown_licenses,
737            binary_derived,
738            0.0,
739            None,
740        )
741    }
742
743    pub fn detect_with_kind_with_score(
744        &self,
745        text: &str,
746        unknown_licenses: bool,
747        binary_derived: bool,
748        min_score: f32,
749    ) -> Result<Vec<LicenseDetection>> {
750        self.detect_with_kind_with_score_and_deadline(
751            text,
752            unknown_licenses,
753            binary_derived,
754            min_score,
755            None,
756        )
757    }
758
759    pub(crate) fn detect_with_kind_with_score_and_deadline(
760        &self,
761        text: &str,
762        unknown_licenses: bool,
763        binary_derived: bool,
764        min_score: f32,
765        deadline: Option<Instant>,
766    ) -> Result<Vec<LicenseDetection>> {
767        ensure_within_deadline(deadline)?;
768        let clean_text = strip_utf8_bom_str(text);
769
770        let content = truncate_detection_text(clean_text);
771
772        ensure_within_deadline(deadline)?;
773        let mut query = if deadline.is_some() {
774            Query::from_extracted_text_with_deadline(
775                content,
776                &self.index,
777                binary_derived,
778                deadline,
779            )?
780        } else {
781            Query::from_extracted_text(content, &self.index, binary_derived)?
782        };
783        let whole_query_run = query.whole_query_run();
784
785        let mut all_matches = Vec::new();
786        let mut candidate_contained_matches = Vec::new();
787        let mut aho_extra_matchables = PositionSet::new();
788        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
789
790        // Phase 1a: Hash matching
791        // Python returns immediately if hash matches found (index.py:987-991)
792        {
793            ensure_within_deadline(deadline)?;
794            let hash_matches = hash_match(&self.index, &whole_query_run);
795
796            if !hash_matches.is_empty() {
797                let mut matches = hash_matches;
798                sort_matches_by_line(&mut matches);
799
800                let groups = group_matches_by_region(&matches);
801                let detections: Vec<LicenseDetection> = groups
802                    .iter()
803                    .map(|group| {
804                        let mut detection = empty_detection();
805                        populate_detection_from_group_with_spdx(
806                            &mut detection,
807                            group,
808                            &self.spdx_mapping,
809                            Some(content),
810                        );
811                        detection
812                    })
813                    .collect();
814
815                return Ok(post_process_detections(detections, min_score));
816            }
817        }
818
819        // Phase 1b: SPDX-LID matching
820        {
821            ensure_within_deadline(deadline)?;
822            let spdx_matches = spdx_lid_match(&self.index, &query);
823            subtract_spdx_match_qspans(
824                &mut query,
825                &mut matched_qspans,
826                &mut aho_extra_matchables,
827                &spdx_matches,
828            );
829            all_matches.extend(spdx_matches);
830        }
831
832        // Phase 1c: Aho-Corasick matching
833        {
834            ensure_within_deadline(deadline)?;
835            let aho_matches = if aho_extra_matchables.is_empty() {
836                if deadline.is_some() {
837                    aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
838                } else {
839                    aho_match(&self.index, &whole_query_run)
840                }
841            } else {
842                if deadline.is_some() {
843                    aho_match::aho_match_with_extra_matchables(
844                        &self.index,
845                        &whole_query_run,
846                        Some(&aho_extra_matchables),
847                        deadline,
848                    )?
849                } else {
850                    aho_match::aho_match_with_extra_matchables(
851                        &self.index,
852                        &whole_query_run,
853                        Some(&aho_extra_matchables),
854                        None,
855                    )?
856                }
857            };
858
859            // Python's get_exact_matches() calls refine_matches with merge=False
860            // This applies quality filters including required phrase filtering
861            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
862            candidate_contained_matches.extend(refined_aho.clone());
863            let (merged_aho, _) = merge_and_prepare_aho_matches(
864                &self.index,
865                &mut query,
866                &mut matched_qspans,
867                &refined_aho,
868            );
869            all_matches.extend(merged_aho);
870
871            let whole_query_followup = collect_whole_query_exact_followup_matches(
872                &self.index,
873                &mut query,
874                &mut matched_qspans,
875                &whole_query_run,
876                deadline,
877            )?;
878            all_matches.extend(whole_query_followup);
879
880            let merged_seq = collect_regular_seq_matches(
881                &self.index,
882                &query,
883                &matched_qspans,
884                &candidate_contained_matches,
885                deadline,
886            )?;
887            all_matches.extend(merged_seq);
888        }
889
890        // Step 1: Initial refine WITHOUT false positive filtering
891        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
892        ensure_within_deadline(deadline)?;
893        let merged_matches =
894            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
895
896        // Step 2: Unknown detection and weak match handling
897        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
898        let refined_matches = if unknown_licenses {
899            // Split weak from good - Python: index.py:1083
900            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
901
902            // Unknown detection on uncovered regions - Python: index.py:1093-1114
903            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
904            let filtered_unknown =
905                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
906
907            let mut all_matches = good_matches;
908            all_matches.extend(filtered_unknown);
909            // reinject weak matches and let refine matches keep the bests
910            // Python: index.py:1117-1118
911            all_matches.extend(weak_matches);
912            all_matches
913        } else {
914            merged_matches
915        };
916
917        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
918        ensure_within_deadline(deadline)?;
919        let refined = refine_matches(&self.index, refined_matches, &query);
920
921        let mut sorted = refined;
922        sort_matches_by_line(&mut sorted);
923
924        let groups = group_matches_by_region(&sorted);
925
926        let detections: Vec<LicenseDetection> = groups
927            .iter()
928            .map(|group| {
929                let mut detection = empty_detection();
930                populate_detection_from_group_with_spdx(
931                    &mut detection,
932                    group,
933                    &self.spdx_mapping,
934                    Some(content),
935                );
936                detection
937            })
938            .collect();
939
940        let detections = post_process_detections(detections, min_score);
941
942        ensure_within_deadline(deadline)?;
943        Ok(detections)
944    }
945
946    pub fn detect_with_kind_and_source(
947        &self,
948        text: &str,
949        unknown_licenses: bool,
950        binary_derived: bool,
951        source_path: &str,
952    ) -> Result<Vec<LicenseDetection>> {
953        self.detect_with_kind_and_source_with_deadline(
954            text,
955            unknown_licenses,
956            binary_derived,
957            source_path,
958            None,
959        )
960    }
961
962    pub(crate) fn detect_with_kind_and_source_with_deadline(
963        &self,
964        text: &str,
965        unknown_licenses: bool,
966        binary_derived: bool,
967        source_path: &str,
968        deadline: Option<Instant>,
969    ) -> Result<Vec<LicenseDetection>> {
970        let mut detections = self.detect_with_kind_with_score_and_deadline(
971            text,
972            unknown_licenses,
973            binary_derived,
974            0.0,
975            deadline,
976        )?;
977        attach_source_path_to_detections(&mut detections, source_path);
978        Ok(detections)
979    }
980
981    pub fn detect_with_kind_and_source_with_score(
982        &self,
983        text: &str,
984        unknown_licenses: bool,
985        binary_derived: bool,
986        source_path: &str,
987        min_score: f32,
988    ) -> Result<Vec<LicenseDetection>> {
989        let mut detections =
990            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
991        attach_source_path_to_detections(&mut detections, source_path);
992        Ok(detections)
993    }
994
995    pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
996        &self,
997        text: &str,
998        unknown_licenses: bool,
999        binary_derived: bool,
1000        source_path: &str,
1001        min_score: f32,
1002        deadline: Option<Instant>,
1003    ) -> Result<Vec<LicenseDetection>> {
1004        let mut detections = self.detect_with_kind_with_score_and_deadline(
1005            text,
1006            unknown_licenses,
1007            binary_derived,
1008            min_score,
1009            deadline,
1010        )?;
1011        attach_source_path_to_detections(&mut detections, source_path);
1012        Ok(detections)
1013    }
1014
1015    /// Detect licenses and return raw matches (like Python's idx.match()).
1016    ///
1017    /// This is primarily used by golden tests and maintenance tooling that need
1018    /// raw match sequences before grouping or post-processing into detections.
1019    #[cfg(any(test, feature = "golden-tests"))]
1020    pub fn detect_matches_with_kind(
1021        &self,
1022        text: &str,
1023        unknown_licenses: bool,
1024        binary_derived: bool,
1025    ) -> Result<Vec<LicenseMatch>> {
1026        let clean_text = strip_utf8_bom_str(text);
1027
1028        let content = truncate_detection_text(clean_text);
1029
1030        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1031        let whole_query_run = query.whole_query_run();
1032
1033        let mut all_matches = Vec::new();
1034        let mut candidate_contained_matches = Vec::new();
1035        let mut aho_extra_matchables = PositionSet::new();
1036        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1037
1038        // Phase 1a: Hash matching
1039        {
1040            let hash_matches = hash_match(&self.index, &whole_query_run);
1041
1042            if !hash_matches.is_empty() {
1043                let mut matches = hash_matches;
1044                sort_matches_by_line(&mut matches);
1045                return Ok(matches);
1046            }
1047        }
1048
1049        // Phase 1b: SPDX-LID matching
1050        {
1051            let spdx_matches = spdx_lid_match(&self.index, &query);
1052            subtract_spdx_match_qspans(
1053                &mut query,
1054                &mut matched_qspans,
1055                &mut aho_extra_matchables,
1056                &spdx_matches,
1057            );
1058            all_matches.extend(spdx_matches);
1059        }
1060
1061        // Phase 1c: Aho-Corasick matching
1062        {
1063            let aho_matches = if aho_extra_matchables.is_empty() {
1064                aho_match(&self.index, &whole_query_run)
1065            } else {
1066                aho_match::aho_match_with_extra_matchables(
1067                    &self.index,
1068                    &whole_query_run,
1069                    Some(&aho_extra_matchables),
1070                    None,
1071                )?
1072            };
1073            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1074            candidate_contained_matches.extend(refined_aho.clone());
1075            let (merged_aho, _) = merge_and_prepare_aho_matches(
1076                &self.index,
1077                &mut query,
1078                &mut matched_qspans,
1079                &refined_aho,
1080            );
1081            all_matches.extend(merged_aho);
1082
1083            let whole_query_followup = collect_whole_query_exact_followup_matches(
1084                &self.index,
1085                &mut query,
1086                &mut matched_qspans,
1087                &whole_query_run,
1088                None,
1089            )?;
1090            all_matches.extend(whole_query_followup);
1091
1092            let merged_seq = collect_regular_seq_matches(
1093                &self.index,
1094                &query,
1095                &matched_qspans,
1096                &candidate_contained_matches,
1097                None,
1098            )?;
1099            all_matches.extend(merged_seq);
1100        }
1101
1102        // Step 1: Initial refine WITHOUT false positive filtering
1103        let merged_matches =
1104            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1105
1106        // Step 2: Unknown detection and weak match handling
1107        let refined_matches = if unknown_licenses {
1108            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1109            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1110            let filtered_unknown =
1111                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1112
1113            let mut all_matches = good_matches;
1114            all_matches.extend(filtered_unknown);
1115            all_matches.extend(weak_matches);
1116            all_matches
1117        } else {
1118            merged_matches
1119        };
1120
1121        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
1122        let refined = refine_matches(&self.index, refined_matches, &query);
1123
1124        let mut sorted = refined;
1125        sort_matches_by_line(&mut sorted);
1126
1127        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
1128        Ok(sorted)
1129    }
1130
1131    /// Get a reference to the license index.
1132    pub fn index(&self) -> &index::LicenseIndex {
1133        &self.index
1134    }
1135
1136    pub fn spdx_license_list_version(&self) -> Option<&str> {
1137        self.spdx_license_list_version.as_deref()
1138    }
1139
1140    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
1141        self.license_index_provenance.as_ref()
1142    }
1143
1144    /// Get a reference to the SPDX mapping.
1145    #[cfg(test)]
1146    pub fn spdx_mapping(&self) -> &SpdxMapping {
1147        &self.spdx_mapping
1148    }
1149}
1150
1151pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1152    for ancestor in search_path.ancestors() {
1153        let candidate = ancestor.join("scancode_config.py");
1154        if candidate.is_file() {
1155            let config = fs::read_to_string(&candidate)?;
1156            return Ok(parse_scancode_spdx_license_list_version(&config));
1157        }
1158    }
1159
1160    Ok(None)
1161}
1162
/// Extract the value assigned to `spdx_license_list_version` in the text of a
/// `scancode_config.py` file.
///
/// Lines are scanned top to bottom and the first matching assignment wins. A
/// line matches when the text left of its first `=` is exactly
/// `spdx_license_list_version`, optionally followed by a `: type` annotation.
/// Longer identifiers that merely share the prefix (for example
/// `spdx_license_list_version_url`) are skipped.
///
/// The value is returned without its surrounding quotes. For a quoted value,
/// anything after the closing quote (such as a trailing `# comment`) is
/// ignored; for an unquoted value, everything from the first `#` on is dropped.
///
/// Returns `None` when no line assigns the key.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (name, value) = line.split_once('=')?;

        // Require the whole identifier to be KEY: whatever follows it must be
        // empty or a type annotation, never more identifier characters.
        let after_key = name.trim().strip_prefix(KEY)?.trim_start();
        if !after_key.is_empty() && !after_key.starts_with(':') {
            return None;
        }

        let value = value.trim();

        // Properly quoted value: keep only what sits between the opening quote
        // and its first matching closing quote.
        let quoted = ['"', '\''].iter().find_map(|&quote| {
            let rest = value.strip_prefix(quote)?;
            rest.find(quote).map(|end| &rest[..end])
        });

        Some(match quoted {
            Some(inner) => inner.to_string(),
            // Unquoted (or unterminated) value: drop any inline comment, then
            // fall back to stripping stray quote characters from both ends.
            None => value
                .split('#')
                .next()
                .unwrap_or(value)
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        })
    })
}
1176
1177#[cfg(test)]
1178mod tests;