Skip to main content

provenant/license_detection/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! License Detection Engine
5
6pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(feature = "golden-tests")]
21pub mod golden_utils;
22pub mod hash_match;
23pub mod index;
24mod match_refine;
25pub mod models;
26pub mod query;
27pub mod rules;
28pub mod seq_match;
29pub mod spdx_lid;
30pub mod spdx_mapping;
31#[cfg(test)]
32mod test_utils;
33pub mod tokenize;
34pub mod unknown_match;
35
36use bit_set::BitSet;
37use std::collections::HashSet;
38use std::fs;
39use std::path::Path;
40use std::sync::Arc;
41use std::time::Instant;
42
43use anyhow::Result;
44
45use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
46use crate::license_detection::dataset::{
47    CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
48    load_license_dataset_from_root,
49};
50use crate::license_detection::embedded::index::{
51    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
52};
53use crate::license_detection::index::build_index_from_loaded;
54use crate::license_detection::license_cache::{
55    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
56    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
57};
58use crate::license_detection::query::Query;
59use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
60use crate::models::LicenseIndexProvenance;
61use crate::utils::text::strip_utf8_bom_str;
62
63use crate::license_detection::detection::{
64    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
65    split_groups_across_frontmatter_boundary,
66};
67use crate::license_detection::models::MatcherKind;
68
69/// Path to the license rules directory in the reference scancode-toolkit submodule.
70/// Used by test code and the xtask generate-license-loader-artifact binary.
71#[allow(dead_code)]
72pub const SCANCODE_LICENSES_RULES_PATH: &str =
73    "reference/scancode-toolkit/src/licensedcode/data/rules";
74
75/// Path to the licenses directory in the reference scancode-toolkit submodule.
76/// Used by test code and the xtask generate-license-loader-artifact binary.
77#[allow(dead_code)]
78pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
79    "reference/scancode-toolkit/src/licensedcode/data/licenses";
80
81/// Path to the license data directory in the reference scancode-toolkit submodule.
82/// Used by test code and the xtask generate-license-loader-artifact binary.
83#[allow(dead_code)]
84pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
85
86pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
87pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
88
89pub(crate) use detection::{
90    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
91};
92pub use models::LicenseMatch;
93
94pub use aho_match::aho_match;
95pub use hash_match::hash_match;
96pub use match_refine::{
97    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
98    refine_matches_without_false_positive_filter, split_weak_matches,
99};
100pub use position_set::PositionSet;
101pub use spdx_lid::spdx_lid_match;
102pub use token_multiset::TokenMultiset;
103pub use token_set::TokenSet;
104pub use unknown_match::unknown_match;
105
106use self::seq_match::{
107    MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
108    seq_match_with_candidates_and_deadline,
109};
110
111/// License detection engine that orchestrates the detection pipeline.
112///
113/// The engine loads license rules and builds an index for efficient matching.
114/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
115/// and combines their results into final license detections.
116#[derive(Debug, Clone)]
117pub struct LicenseDetectionEngine {
118    index: Arc<index::LicenseIndex>,
119    spdx_mapping: SpdxMapping,
120    spdx_license_list_version: Option<String>,
121    license_index_provenance: Option<LicenseIndexProvenance>,
122}
123
124const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
125const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
126const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
127const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
128
/// Report whether `deadline` has been reached.
///
/// Returns `false` when no deadline is set; otherwise returns `true` once the
/// current instant is at or past the deadline.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
132
133pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
134    if deadline_exceeded(deadline) {
135        Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
136    } else {
137        Ok(())
138    }
139}
140
141fn truncate_detection_text(clean_text: &str) -> &str {
142    if clean_text.len() <= MAX_DETECTION_SIZE {
143        return clean_text;
144    }
145
146    log::debug!(
147        "Content size {} exceeds limit {}, truncating for detection",
148        clean_text.len(),
149        MAX_DETECTION_SIZE
150    );
151
152    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
153    &clean_text[..boundary]
154}
155
156fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
157    (!m.query_span().is_empty()).then(|| m.query_span().clone())
158}
159
160fn has_full_match_coverage(m: &LicenseMatch) -> bool {
161    m.coverage() == 100.0
162}
163
164fn is_redundant_same_expression_seq_container(
165    container: &LicenseMatch,
166    candidate_contained_matches: &[LicenseMatch],
167) -> bool {
168    let container_is_redundant_coverage =
169        has_full_match_coverage(container) || container.coverage() >= 99.0;
170    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
171        return false;
172    }
173
174    let container_qspan_set = container.qspan_set();
175
176    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
177        .iter()
178        .filter(|m| {
179            m.matcher == MatcherKind::Aho
180                && has_full_match_coverage(m)
181                && m.license_expression == container.license_expression
182                && m.overlaps_with(&container_qspan_set)
183        })
184        .collect();
185
186    if contained.len() < 2 {
187        return false;
188    }
189
190    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
191    if material_children < 2 {
192        return false;
193    }
194
195    contained.sort_by_key(|m| m.qspan_bounds());
196
197    let mut child_union = PositionSet::new();
198    for m in &contained {
199        child_union.extend_from_span(m.query_span());
200    }
201
202    let container_only_positions = container_qspan_set.difference(&child_union);
203    let child_only_positions = child_union.difference(&container_qspan_set);
204
205    let mut bridge_positions = BitSet::new();
206    for pair in contained.windows(2) {
207        let (_, previous_end) = pair[0].qspan_bounds();
208        let (next_start, _) = pair[1].qspan_bounds();
209
210        if next_start < previous_end {
211            return false;
212        }
213
214        for pos in previous_end..next_start {
215            bridge_positions.insert(pos);
216        }
217    }
218
219    let container_only_boundary_positions = container_only_positions
220        .iter()
221        .filter(|&pos| !bridge_positions.contains(pos))
222        .count();
223
224    if container_only_positions.len() == 1
225        && container_only_boundary_positions == 0
226        && child_only_positions.is_empty()
227    {
228        return false;
229    }
230
231    if child_only_positions.is_empty()
232        && container_only_positions.len() == container_only_boundary_positions
233        && container_only_boundary_positions <= 3
234    {
235        let earliest_child = contained
236            .iter()
237            .map(|m| m.qspan_bounds().0)
238            .min()
239            .unwrap_or(usize::MAX);
240        let latest_child = contained
241            .iter()
242            .map(|m| m.qspan_bounds().1.saturating_sub(1))
243            .max()
244            .unwrap_or(0);
245
246        let is_one_sided_boundary = container_only_positions
247            .iter()
248            .all(|pos| pos < earliest_child)
249            || container_only_positions
250                .iter()
251                .all(|pos| pos > latest_child);
252
253        if is_one_sided_boundary {
254            return false;
255        }
256    }
257
258    let max_container_only_positions =
259        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
260    let max_container_boundary_positions =
261        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
262    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
263
264    container_only_positions.len() <= max_container_only_positions
265        && container_only_boundary_positions <= max_container_boundary_positions
266        && child_only_positions.len() <= max_child_only_positions
267}
268
269fn filter_redundant_same_expression_seq_containers(
270    seq_matches: Vec<LicenseMatch>,
271    candidate_contained_matches: &[LicenseMatch],
272) -> Vec<LicenseMatch> {
273    seq_matches
274        .into_iter()
275        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
276        .collect()
277}
278
279fn is_redundant_low_coverage_composite_seq_wrapper(
280    container: &LicenseMatch,
281    candidate_contained_matches: &[LicenseMatch],
282) -> bool {
283    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
284        return false;
285    }
286
287    let container_qspan_set = container.qspan_set();
288
289    let children: Vec<&LicenseMatch> = candidate_contained_matches
290        .iter()
291        .filter(|m| {
292            m.matcher == aho_match::MATCH_AHO
293                && has_full_match_coverage(m)
294                && m.license_expression != container.license_expression
295                && m.overlaps_with(&container_qspan_set)
296        })
297        .collect();
298
299    if children.len() < 2 {
300        return false;
301    }
302
303    let unique_expressions: HashSet<&str> = children
304        .iter()
305        .map(|m| m.license_expression.as_str())
306        .collect();
307    if unique_expressions.len() < 2 {
308        return false;
309    }
310
311    let mut child_union = PositionSet::new();
312    for m in &children {
313        child_union.extend_from_span(m.query_span());
314    }
315
316    let container_only_positions = container_qspan_set.difference(&child_union);
317    let child_only_positions = child_union.difference(&container_qspan_set);
318
319    let mut sorted_children = children;
320    sorted_children.sort_by_key(|m| m.qspan_bounds());
321
322    let mut bridge_positions = BitSet::new();
323    for pair in sorted_children.windows(2) {
324        let (_, previous_end) = pair[0].qspan_bounds();
325        let (next_start, _) = pair[1].qspan_bounds();
326        for pos in previous_end..next_start {
327            bridge_positions.insert(pos);
328        }
329    }
330
331    let container_only_boundary_positions = container_only_positions
332        .iter()
333        .filter(|&pos| !bridge_positions.contains(pos))
334        .count();
335
336    child_only_positions.is_empty()
337        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
338        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
339}
340
341fn filter_redundant_low_coverage_composite_seq_wrappers(
342    seq_matches: Vec<LicenseMatch>,
343    candidate_contained_matches: &[LicenseMatch],
344) -> Vec<LicenseMatch> {
345    seq_matches
346        .into_iter()
347        .filter(|m| {
348            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
349        })
350        .collect()
351}
352
353fn subtract_spdx_match_qspans(
354    query: &mut Query<'_>,
355    matched_qspans: &mut Vec<models::PositionSpan>,
356    aho_extra_matchables: &mut PositionSet,
357    spdx_matches: &[LicenseMatch],
358) {
359    for m in spdx_matches {
360        let Some(span) = query_span_for_match(m) else {
361            continue;
362        };
363
364        aho_extra_matchables.extend_from_span(&span);
365        query.subtract(&span);
366
367        if has_full_match_coverage(m) {
368            matched_qspans.push(span);
369        }
370    }
371}
372
373fn merge_and_prepare_aho_matches(
374    index: &index::LicenseIndex,
375    query: &mut Query<'_>,
376    matched_qspans: &mut Vec<models::PositionSpan>,
377    refined_aho: &[LicenseMatch],
378) -> (Vec<LicenseMatch>, bool) {
379    let merged_aho = merge_overlapping_matches(refined_aho);
380    let mut saw_long_exact_license_text_match = false;
381
382    for m in &merged_aho {
383        let Some(span) = query_span_for_match(m) else {
384            continue;
385        };
386
387        if has_full_match_coverage(m) {
388            matched_qspans.push(span.clone());
389        }
390
391        if index.rule(m.rid).is_some_and(|rule| rule.is_license_text())
392            && m.rule_length > 120
393            && m.coverage() > 98.0
394        {
395            query.subtract(&span);
396            saw_long_exact_license_text_match = true;
397        }
398    }
399
400    (merged_aho, saw_long_exact_license_text_match)
401}
402
403fn collect_whole_query_exact_followup_matches(
404    index: &index::LicenseIndex,
405    query: &mut Query<'_>,
406    matched_qspans: &mut Vec<models::PositionSpan>,
407    whole_run: &query::QueryRun<'_>,
408    deadline: Option<Instant>,
409) -> Result<Vec<LicenseMatch>> {
410    let mut seq_all_matches = Vec::new();
411
412    if whole_run.is_matchable(false, matched_qspans) {
413        let near_dupe_candidates = if deadline.is_some() {
414            select_seq_candidates_with_deadline(
415                index,
416                whole_run,
417                true,
418                MAX_NEAR_DUPE_CANDIDATES,
419                deadline,
420            )?
421        } else {
422            self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
423        };
424
425        if !near_dupe_candidates.is_empty() {
426            let near_dupe_matches = if deadline.is_some() {
427                seq_match_with_candidates_and_deadline(
428                    index,
429                    whole_run,
430                    &near_dupe_candidates,
431                    deadline,
432                )?
433            } else {
434                self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
435            };
436
437            for m in &near_dupe_matches {
438                if !m.query_span().is_empty() {
439                    let span = m.query_span().clone();
440                    query.subtract(&span);
441                    matched_qspans.push(span);
442                }
443            }
444
445            seq_all_matches.extend(near_dupe_matches);
446        }
447    }
448
449    Ok(seq_all_matches)
450}
451
452fn collect_regular_seq_matches(
453    index: &index::LicenseIndex,
454    query: &Query<'_>,
455    matched_qspans: &[models::PositionSpan],
456    candidate_contained_matches: &[LicenseMatch],
457    deadline: Option<Instant>,
458) -> Result<Vec<LicenseMatch>> {
459    let mut seq_all_matches = Vec::new();
460
461    for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
462        if query_run_index % 8 == 0 {
463            ensure_within_deadline(deadline)?;
464        }
465
466        if !query_run.is_matchable(false, matched_qspans) {
467            continue;
468        }
469
470        let candidates = if deadline.is_some() {
471            select_seq_candidates_with_deadline(
472                index,
473                &query_run,
474                false,
475                MAX_REGULAR_SEQ_CANDIDATES,
476                deadline,
477            )?
478        } else {
479            self::seq_match::select_seq_candidates(
480                index,
481                &query_run,
482                false,
483                MAX_REGULAR_SEQ_CANDIDATES,
484            )
485        };
486        if !candidates.is_empty() {
487            let matches = if deadline.is_some() {
488                seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
489            } else {
490                self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
491            };
492            seq_all_matches.extend(matches);
493        }
494    }
495
496    let merged_seq = merge_overlapping_matches(&seq_all_matches);
497    let filtered_same_expression =
498        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
499    Ok(filter_redundant_low_coverage_composite_seq_wrappers(
500        filtered_same_expression,
501        candidate_contained_matches,
502    ))
503}
504
505impl LicenseDetectionEngine {
506    /// Create a new license detection engine from a pre-built license index.
507    ///
508    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
509    /// It builds the SPDX mapping from the licenses in the index.
510    fn from_index(
511        index: index::LicenseIndex,
512        spdx_license_list_version: Option<String>,
513        license_index_provenance: Option<LicenseIndexProvenance>,
514    ) -> Result<Self> {
515        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
516        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
517        let spdx_mapping = build_spdx_mapping(&license_vec);
518
519        Ok(Self {
520            index: Arc::new(index),
521            spdx_mapping,
522            spdx_license_list_version,
523            license_index_provenance,
524        })
525    }
526
527    #[cfg(test)]
528    pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
529        Self::from_index(index, None, None).expect("test index should build license engine")
530    }
531
532    /// Create a new license detection engine from the embedded license index.
533    ///
534    /// Convenience method that uses the default Provenant cache root and does
535    /// not force a reindex.
536    pub fn from_embedded() -> Result<Self> {
537        let cache_config =
538            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
539        Self::from_embedded_with_cache(&cache_config)
540    }
541
542    /// Create a new license detection engine from the embedded license index.
543    ///
544    /// This method loads the build-time embedded license artifact and constructs
545    /// the runtime license index. This eliminates the runtime dependency on the
546    /// ScanCode rules directory.
547    ///
548    /// If a valid cache exists (matching fingerprint), the index is loaded from
549    /// the rkyv cache file instead of being rebuilt from scratch.
550    ///
551    /// # Arguments
552    /// * `cache_config` - Cache configuration (directory and reindex flag)
553    ///
554    /// # Returns
555    /// A Result containing the engine or an error
556    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
557        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
558        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
559        let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
560            .map_err(|e| {
561                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
562            })?;
563        debug_assert_eq!(
564            artifact_metadata.license_index_provenance.source,
565            EMBEDDED_LICENSE_INDEX_SOURCE
566        );
567        let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
568        let provenance = Some(artifact_metadata.license_index_provenance.clone());
569
570        if !cache_config.reindex {
571            if let Some(cached) =
572                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
573            {
574                let start = Instant::now();
575                eprintln!(
576                    "License index loaded from rkyv cache in {:.2}s",
577                    start.elapsed().as_secs_f64()
578                );
579                return Self::from_index(cached, spdx_version, provenance);
580            }
581        } else {
582            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
583        }
584
585        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
586            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
587        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
588        let provenance = Some(snapshot.metadata.license_index_provenance.clone());
589
590        let start = Instant::now();
591        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
592        eprintln!(
593            "License index built from embedded artifact in {:.2}s",
594            start.elapsed().as_secs_f64()
595        );
596
597        let mut index = index;
598        index.spdx_license_list_version = spdx_version.clone();
599        if let Err(e) = save_cached_index(
600            cache_config,
601            LicenseCacheNamespace::Embedded,
602            &index,
603            &fingerprint,
604        ) {
605            eprintln!("Warning: failed to save license index cache: {}", e);
606        } else if let Some(size) =
607            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
608        {
609            eprintln!(
610                "License index cache saved ({:.1} MB)",
611                size as f64 / 1_048_576.0
612            );
613        }
614
615        Self::from_index(index, spdx_version, provenance)
616    }
617
618    /// Create a new license detection engine from a license dataset root.
619    ///
620    /// Convenience method that uses the default Provenant cache root and does
621    /// not force a reindex.
622    pub fn from_directory(rules_path: &Path) -> Result<Self> {
623        let cache_config =
624            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
625        Self::from_directory_with_cache(rules_path, &cache_config)
626    }
627
628    /// Create a new license detection engine from a directory of license rules.
629    ///
630    /// If a valid cache exists (matching fingerprint of the dataset), the index is
631    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
632    ///
633    /// # Arguments
634    /// * `rules_path` - Path to dataset root containing rules/ and licenses/
635    /// * `cache_config` - Cache configuration (directory and reindex flag)
636    ///
637    /// # Returns
638    /// A Result containing the engine or an error
639    pub fn from_directory_with_cache(
640        rules_path: &Path,
641        cache_config: &LicenseCacheConfig,
642    ) -> Result<Self> {
643        let LoadedLicenseDataset {
644            manifest,
645            rules: loaded_rules,
646            licenses: loaded_licenses,
647        } = load_license_dataset_from_root(rules_path)?;
648
649        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
650        let provenance = Some(LicenseIndexProvenance {
651            source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
652            dataset_fingerprint: compute_dataset_fingerprint_string(
653                &loaded_rules,
654                &loaded_licenses,
655            )?,
656            ignored_rules: vec![],
657            ignored_licenses: vec![],
658            ignored_rules_due_to_licenses: vec![],
659            added_rules: vec![],
660            replaced_rules: vec![],
661            added_licenses: vec![],
662            replaced_licenses: vec![],
663        });
664
665        if !cache_config.reindex {
666            if let Some(cached) = load_cached_index(
667                cache_config,
668                LicenseCacheNamespace::CustomRules,
669                &fingerprint,
670            )? {
671                let start = Instant::now();
672                eprintln!(
673                    "License index loaded from rkyv cache in {:.2}s",
674                    start.elapsed().as_secs_f64()
675                );
676                return Self::from_index(
677                    cached,
678                    Some(manifest.spdx_license_list_version),
679                    provenance,
680                );
681            }
682        } else {
683            delete_cache(
684                cache_config,
685                LicenseCacheNamespace::CustomRules,
686                &fingerprint,
687            )?;
688        }
689
690        let start = Instant::now();
691        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
692        eprintln!(
693            "License index built from custom dataset in {:.2}s",
694            start.elapsed().as_secs_f64()
695        );
696
697        if let Err(e) = save_cached_index(
698            cache_config,
699            LicenseCacheNamespace::CustomRules,
700            &index,
701            &fingerprint,
702        ) {
703            eprintln!("Warning: failed to save license index cache: {}", e);
704        } else if let Some(size) = cache_file_size(
705            cache_config,
706            LicenseCacheNamespace::CustomRules,
707            &fingerprint,
708        ) {
709            eprintln!(
710                "License index cache saved ({:.1} MB)",
711                size as f64 / 1_048_576.0
712            );
713        }
714
715        Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
716    }
717
718    pub fn embedded_spdx_license_list_version() -> Result<String> {
719        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
720        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
721            .map_err(|e| {
722                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
723            })?
724            .spdx_license_list_version)
725    }
726
727    pub fn detect_with_kind(
728        &self,
729        text: &str,
730        unknown_licenses: bool,
731        binary_derived: bool,
732    ) -> Result<Vec<LicenseDetection>> {
733        self.detect_with_kind_with_score_and_deadline(
734            text,
735            unknown_licenses,
736            binary_derived,
737            0.0,
738            None,
739        )
740    }
741
742    pub fn detect_with_kind_with_score(
743        &self,
744        text: &str,
745        unknown_licenses: bool,
746        binary_derived: bool,
747        min_score: f32,
748    ) -> Result<Vec<LicenseDetection>> {
749        self.detect_with_kind_with_score_and_deadline(
750            text,
751            unknown_licenses,
752            binary_derived,
753            min_score,
754            None,
755        )
756    }
757
758    pub(crate) fn detect_with_kind_with_score_and_deadline(
759        &self,
760        text: &str,
761        unknown_licenses: bool,
762        binary_derived: bool,
763        min_score: f32,
764        deadline: Option<Instant>,
765    ) -> Result<Vec<LicenseDetection>> {
766        ensure_within_deadline(deadline)?;
767        let clean_text = strip_utf8_bom_str(text);
768
769        let content = truncate_detection_text(clean_text);
770
771        ensure_within_deadline(deadline)?;
772        let mut query = if deadline.is_some() {
773            Query::from_extracted_text_with_deadline(
774                content,
775                &self.index,
776                binary_derived,
777                deadline,
778            )?
779        } else {
780            Query::from_extracted_text(content, &self.index, binary_derived)?
781        };
782        let whole_query_run = query.whole_query_run();
783
784        let mut all_matches = Vec::new();
785        let mut candidate_contained_matches = Vec::new();
786        let mut aho_extra_matchables = PositionSet::new();
787        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
788
789        // Phase 1a: Hash matching
790        // Python returns immediately if hash matches found (index.py:987-991)
791        {
792            ensure_within_deadline(deadline)?;
793            let hash_matches = hash_match(&self.index, &whole_query_run);
794
795            if !hash_matches.is_empty() {
796                let mut matches = hash_matches;
797                sort_matches_by_line(&mut matches);
798
799                let groups = split_groups_across_frontmatter_boundary(
800                    group_matches_by_region(&matches),
801                    Some(content),
802                );
803                let detections: Vec<LicenseDetection> = groups
804                    .iter()
805                    .map(|group| {
806                        let mut detection = empty_detection();
807                        populate_detection_from_group_with_spdx(
808                            &mut detection,
809                            group,
810                            &self.spdx_mapping,
811                            Some(content),
812                        );
813                        detection
814                    })
815                    .collect();
816
817                return Ok(post_process_detections(detections, min_score));
818            }
819        }
820
821        // Phase 1b: SPDX-LID matching
822        {
823            ensure_within_deadline(deadline)?;
824            let spdx_matches = spdx_lid_match(&self.index, &query);
825            subtract_spdx_match_qspans(
826                &mut query,
827                &mut matched_qspans,
828                &mut aho_extra_matchables,
829                &spdx_matches,
830            );
831            all_matches.extend(spdx_matches);
832        }
833
834        // Phase 1c: Aho-Corasick matching
835        {
836            ensure_within_deadline(deadline)?;
837            let aho_matches = if aho_extra_matchables.is_empty() {
838                if deadline.is_some() {
839                    aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
840                } else {
841                    aho_match(&self.index, &whole_query_run)
842                }
843            } else {
844                if deadline.is_some() {
845                    aho_match::aho_match_with_extra_matchables(
846                        &self.index,
847                        &whole_query_run,
848                        Some(&aho_extra_matchables),
849                        deadline,
850                    )?
851                } else {
852                    aho_match::aho_match_with_extra_matchables(
853                        &self.index,
854                        &whole_query_run,
855                        Some(&aho_extra_matchables),
856                        None,
857                    )?
858                }
859            };
860
861            // Python's get_exact_matches() calls refine_matches with merge=False
862            // This applies quality filters including required phrase filtering
863            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
864            candidate_contained_matches.extend(refined_aho.clone());
865            let (merged_aho, _) = merge_and_prepare_aho_matches(
866                &self.index,
867                &mut query,
868                &mut matched_qspans,
869                &refined_aho,
870            );
871            all_matches.extend(merged_aho);
872
873            let whole_query_followup = collect_whole_query_exact_followup_matches(
874                &self.index,
875                &mut query,
876                &mut matched_qspans,
877                &whole_query_run,
878                deadline,
879            )?;
880            all_matches.extend(whole_query_followup);
881
882            let merged_seq = collect_regular_seq_matches(
883                &self.index,
884                &query,
885                &matched_qspans,
886                &candidate_contained_matches,
887                deadline,
888            )?;
889            all_matches.extend(merged_seq);
890        }
891
892        // Step 1: Initial refine WITHOUT false positive filtering
893        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
894        ensure_within_deadline(deadline)?;
895        let merged_matches =
896            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
897
898        // Step 2: Unknown detection and weak match handling
899        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
900        let refined_matches = if unknown_licenses {
901            // Split weak from good - Python: index.py:1083
902            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
903
904            // Unknown detection on uncovered regions - Python: index.py:1093-1114
905            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
906            let filtered_unknown =
907                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
908
909            let mut all_matches = good_matches;
910            all_matches.extend(filtered_unknown);
911            // reinject weak matches and let refine matches keep the bests
912            // Python: index.py:1117-1118
913            all_matches.extend(weak_matches);
914            all_matches
915        } else {
916            merged_matches
917        };
918
919        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
920        ensure_within_deadline(deadline)?;
921        let refined = refine_matches(&self.index, refined_matches, &query);
922
923        let mut sorted = refined;
924        sort_matches_by_line(&mut sorted);
925
926        let groups = split_groups_across_frontmatter_boundary(
927            group_matches_by_region(&sorted),
928            Some(content),
929        );
930
931        let detections: Vec<LicenseDetection> = groups
932            .iter()
933            .map(|group| {
934                let mut detection = empty_detection();
935                populate_detection_from_group_with_spdx(
936                    &mut detection,
937                    group,
938                    &self.spdx_mapping,
939                    Some(content),
940                );
941                detection
942            })
943            .collect();
944
945        let detections = post_process_detections(detections, min_score);
946
947        ensure_within_deadline(deadline)?;
948        Ok(detections)
949    }
950
951    pub fn detect_with_kind_and_source(
952        &self,
953        text: &str,
954        unknown_licenses: bool,
955        binary_derived: bool,
956        source_path: &str,
957    ) -> Result<Vec<LicenseDetection>> {
958        self.detect_with_kind_and_source_with_deadline(
959            text,
960            unknown_licenses,
961            binary_derived,
962            source_path,
963            None,
964        )
965    }
966
967    pub(crate) fn detect_with_kind_and_source_with_deadline(
968        &self,
969        text: &str,
970        unknown_licenses: bool,
971        binary_derived: bool,
972        source_path: &str,
973        deadline: Option<Instant>,
974    ) -> Result<Vec<LicenseDetection>> {
975        let mut detections = self.detect_with_kind_with_score_and_deadline(
976            text,
977            unknown_licenses,
978            binary_derived,
979            0.0,
980            deadline,
981        )?;
982        attach_source_path_to_detections(&mut detections, source_path);
983        Ok(detections)
984    }
985
986    pub fn detect_with_kind_and_source_with_score(
987        &self,
988        text: &str,
989        unknown_licenses: bool,
990        binary_derived: bool,
991        source_path: &str,
992        min_score: f32,
993    ) -> Result<Vec<LicenseDetection>> {
994        let mut detections =
995            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
996        attach_source_path_to_detections(&mut detections, source_path);
997        Ok(detections)
998    }
999
1000    pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
1001        &self,
1002        text: &str,
1003        unknown_licenses: bool,
1004        binary_derived: bool,
1005        source_path: &str,
1006        min_score: f32,
1007        deadline: Option<Instant>,
1008    ) -> Result<Vec<LicenseDetection>> {
1009        let mut detections = self.detect_with_kind_with_score_and_deadline(
1010            text,
1011            unknown_licenses,
1012            binary_derived,
1013            min_score,
1014            deadline,
1015        )?;
1016        attach_source_path_to_detections(&mut detections, source_path);
1017        Ok(detections)
1018    }
1019
1020    /// Detect licenses and return raw matches (like Python's idx.match()).
1021    ///
1022    /// This is primarily used by golden tests and maintenance tooling that need
1023    /// raw match sequences before grouping or post-processing into detections.
1024    #[cfg(any(test, feature = "golden-tests"))]
1025    pub fn detect_matches_with_kind(
1026        &self,
1027        text: &str,
1028        unknown_licenses: bool,
1029        binary_derived: bool,
1030    ) -> Result<Vec<LicenseMatch>> {
1031        let clean_text = strip_utf8_bom_str(text);
1032
1033        let content = truncate_detection_text(clean_text);
1034
1035        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1036        let whole_query_run = query.whole_query_run();
1037
1038        let mut all_matches = Vec::new();
1039        let mut candidate_contained_matches = Vec::new();
1040        let mut aho_extra_matchables = PositionSet::new();
1041        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1042
1043        // Phase 1a: Hash matching
1044        {
1045            let hash_matches = hash_match(&self.index, &whole_query_run);
1046
1047            if !hash_matches.is_empty() {
1048                let mut matches = hash_matches;
1049                sort_matches_by_line(&mut matches);
1050                return Ok(matches);
1051            }
1052        }
1053
1054        // Phase 1b: SPDX-LID matching
1055        {
1056            let spdx_matches = spdx_lid_match(&self.index, &query);
1057            subtract_spdx_match_qspans(
1058                &mut query,
1059                &mut matched_qspans,
1060                &mut aho_extra_matchables,
1061                &spdx_matches,
1062            );
1063            all_matches.extend(spdx_matches);
1064        }
1065
1066        // Phase 1c: Aho-Corasick matching
1067        {
1068            let aho_matches = if aho_extra_matchables.is_empty() {
1069                aho_match(&self.index, &whole_query_run)
1070            } else {
1071                aho_match::aho_match_with_extra_matchables(
1072                    &self.index,
1073                    &whole_query_run,
1074                    Some(&aho_extra_matchables),
1075                    None,
1076                )?
1077            };
1078            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1079            candidate_contained_matches.extend(refined_aho.clone());
1080            let (merged_aho, _) = merge_and_prepare_aho_matches(
1081                &self.index,
1082                &mut query,
1083                &mut matched_qspans,
1084                &refined_aho,
1085            );
1086            all_matches.extend(merged_aho);
1087
1088            let whole_query_followup = collect_whole_query_exact_followup_matches(
1089                &self.index,
1090                &mut query,
1091                &mut matched_qspans,
1092                &whole_query_run,
1093                None,
1094            )?;
1095            all_matches.extend(whole_query_followup);
1096
1097            let merged_seq = collect_regular_seq_matches(
1098                &self.index,
1099                &query,
1100                &matched_qspans,
1101                &candidate_contained_matches,
1102                None,
1103            )?;
1104            all_matches.extend(merged_seq);
1105        }
1106
1107        // Step 1: Initial refine WITHOUT false positive filtering
1108        let merged_matches =
1109            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1110
1111        // Step 2: Unknown detection and weak match handling
1112        let refined_matches = if unknown_licenses {
1113            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1114            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1115            let filtered_unknown =
1116                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1117
1118            let mut all_matches = good_matches;
1119            all_matches.extend(filtered_unknown);
1120            all_matches.extend(weak_matches);
1121            all_matches
1122        } else {
1123            merged_matches
1124        };
1125
1126        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
1127        let refined = refine_matches(&self.index, refined_matches, &query);
1128
1129        let mut sorted = refined;
1130        sort_matches_by_line(&mut sorted);
1131
1132        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
1133        Ok(sorted)
1134    }
1135
    /// Returns a shared reference to the license index held by `self`.
    ///
    /// The reference borrows from `self`; nothing is cloned.
    pub fn index(&self) -> &index::LicenseIndex {
        &self.index
    }
1140
    /// Returns the stored SPDX license list version as a borrowed string
    /// slice, or `None` when no version is stored on `self`.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }
1144
    /// Returns a reference to the stored license index provenance, or `None`
    /// when no provenance is stored on `self`.
    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
        self.license_index_provenance.as_ref()
    }
1148
    /// Returns a shared reference to the SPDX mapping held by `self`.
    ///
    /// Only compiled for test builds.
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
1154}
1155
1156pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1157    for ancestor in search_path.ancestors() {
1158        let candidate = ancestor.join("scancode_config.py");
1159        if candidate.is_file() {
1160            let config = fs::read_to_string(&candidate)?;
1161            return Ok(parse_scancode_spdx_license_list_version(&config));
1162        }
1163    }
1164
1165    Ok(None)
1166}
1167
/// Extracts the value assigned to `spdx_license_list_version` in the text of a
/// `scancode_config.py` file.
///
/// `config` is scanned line by line. The first line that, after trimming,
/// starts with the identifier `spdx_license_list_version` and contains an `=`
/// wins; everything after the first `=` is unquoted and returned.
///
/// The identifier must end where the key ends: a longer name that merely
/// shares the prefix (for example `spdx_license_list_version_url`) is skipped
/// rather than mistaken for the version. An annotated assignment such as
/// `spdx_license_list_version: str = "3.27"` is still accepted.
///
/// Returns `None` when no line qualifies. An assignment with an empty
/// right-hand side yields `Some(String::new())`.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (target, value) = line.trim().split_once('=')?;
        let after_key = target.strip_prefix(KEY)?;

        // An identifier character right after the key means this is a different,
        // longer name, not the setting being looked for.
        let is_longer_identifier =
            matches!(after_key.chars().next(), Some(c) if c.is_alphanumeric() || c == '_');
        if is_longer_identifier {
            return None;
        }

        Some(unquote_scancode_config_value(value).to_string())
    })
}

/// Reduces the right-hand side of a config assignment to its bare value.
///
/// For a quoted literal, everything after the matching closing quote (such as
/// a trailing `# comment`) is discarded. For an unquoted value, everything
/// from the first `#` onwards is discarded. Surrounding `"` and `'` characters
/// are then trimmed, double quotes first.
fn unquote_scancode_config_value(raw: &str) -> &str {
    let raw = raw.trim();

    let literal = match raw.chars().next() {
        // Both quote characters are one byte long, so the offsets below always
        // fall on character boundaries.
        Some(quote @ ('"' | '\'')) => match raw[1..].find(quote) {
            Some(closing) => &raw[..closing + 2],
            // Unterminated literal: keep the text as-is and let the trimming
            // below strip the opening quote.
            None => raw,
        },
        _ => raw
            .split_once('#')
            .map_or(raw, |(before_comment, _)| before_comment)
            .trim_end(),
    };

    literal.trim_matches('"').trim_matches('\'')
}
1181
1182#[cfg(test)]
1183mod tests;