Skip to main content

provenant/license_detection/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! License Detection Engine
5
6pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(feature = "golden-tests")]
21pub mod golden_utils;
22pub mod hash_match;
23pub mod index;
24mod match_refine;
25pub mod models;
26pub mod query;
27pub mod rules;
28pub mod seq_match;
29pub mod spdx_lid;
30pub mod spdx_mapping;
31#[cfg(test)]
32mod test_utils;
33pub mod tokenize;
34pub mod unknown_match;
35
36use bit_set::BitSet;
37use std::collections::HashSet;
38use std::fs;
39use std::path::Path;
40use std::sync::Arc;
41use std::time::Instant;
42
43use anyhow::Result;
44
45use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
46use crate::license_detection::dataset::{
47    CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
48    load_license_dataset_from_root,
49};
50use crate::license_detection::embedded::index::{
51    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
52};
53use crate::license_detection::index::build_index_from_loaded;
54use crate::license_detection::license_cache::{
55    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
56    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
57};
58use crate::license_detection::query::Query;
59use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
60use crate::models::LicenseIndexProvenance;
61use crate::utils::text::strip_utf8_bom_str;
62
63use crate::license_detection::detection::{
64    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
65    split_groups_across_frontmatter_boundary,
66};
67use crate::license_detection::models::MatcherKind;
68
69/// Path to the license rules directory in the reference scancode-toolkit submodule.
70/// Used by test code and the xtask generate-license-loader-artifact binary.
71#[allow(dead_code)]
72pub const SCANCODE_LICENSES_RULES_PATH: &str =
73    "reference/scancode-toolkit/src/licensedcode/data/rules";
74
75/// Path to the licenses directory in the reference scancode-toolkit submodule.
76/// Used by test code and the xtask generate-license-loader-artifact binary.
77#[allow(dead_code)]
78pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
79    "reference/scancode-toolkit/src/licensedcode/data/licenses";
80
81/// Path to the license data directory in the reference scancode-toolkit submodule.
82/// Used by test code and the xtask generate-license-loader-artifact binary.
83#[allow(dead_code)]
84pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
85
86pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
87pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
88
89pub(crate) use detection::{
90    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
91};
92pub use models::LicenseMatch;
93
94pub use aho_match::aho_match;
95pub use hash_match::hash_match;
96pub use match_refine::{
97    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
98    refine_matches_without_false_positive_filter, split_weak_matches,
99};
100pub use position_set::PositionSet;
101pub use spdx_lid::spdx_lid_match;
102pub use token_multiset::TokenMultiset;
103pub use token_set::TokenSet;
104pub use unknown_match::unknown_match;
105
106use self::seq_match::{
107    MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
108    seq_match_with_candidates_and_deadline,
109};
110
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
///
/// Cloning the engine clones the `Arc` handle to the index, not the index itself.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Runtime license index used by every matcher, shared behind an `Arc`.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is constructed.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version supplied by the index source (embedded artifact
    /// metadata or dataset manifest); `None` when the constructor was given none.
    spdx_license_list_version: Option<String>,
    /// Provenance record describing where the license index came from, if known.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
123
124const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
125const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
126const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
127const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
128
/// Reports whether `deadline` has already been reached.
///
/// A `None` deadline never expires. Otherwise the deadline counts as exceeded as soon
/// as the current instant is at or past it.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => limit <= Instant::now(),
        None => false,
    }
}
132
133pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
134    if deadline_exceeded(deadline) {
135        Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
136    } else {
137        Ok(())
138    }
139}
140
141fn truncate_detection_text(clean_text: &str) -> &str {
142    if clean_text.len() <= MAX_DETECTION_SIZE {
143        return clean_text;
144    }
145
146    log::debug!(
147        "Content size {} exceeds limit {}, truncating for detection",
148        clean_text.len(),
149        MAX_DETECTION_SIZE
150    );
151
152    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
153    &clean_text[..boundary]
154}
155
156fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
157    (!m.query_span().is_empty()).then(|| m.query_span().clone())
158}
159
160fn has_full_match_coverage(m: &LicenseMatch) -> bool {
161    m.coverage() == 100.0
162}
163
164fn is_redundant_same_expression_seq_container(
165    container: &LicenseMatch,
166    candidate_contained_matches: &[LicenseMatch],
167) -> bool {
168    let container_is_redundant_coverage =
169        has_full_match_coverage(container) || container.coverage() >= 99.0;
170    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
171        return false;
172    }
173
174    let container_qspan_set = container.qspan_set();
175
176    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
177        .iter()
178        .filter(|m| {
179            m.matcher == MatcherKind::Aho
180                && has_full_match_coverage(m)
181                && m.license_expression == container.license_expression
182                && m.overlaps_with(&container_qspan_set)
183        })
184        .collect();
185
186    if contained.len() < 2 {
187        return false;
188    }
189
190    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
191    if material_children < 2 {
192        return false;
193    }
194
195    contained.sort_by_key(|m| m.qspan_bounds());
196
197    let mut child_union = PositionSet::new();
198    for m in &contained {
199        child_union.extend_from_span(m.query_span());
200    }
201
202    let container_only_positions = container_qspan_set.difference(&child_union);
203    let child_only_positions = child_union.difference(&container_qspan_set);
204
205    let mut bridge_positions = BitSet::new();
206    for pair in contained.windows(2) {
207        let (_, previous_end) = pair[0].qspan_bounds();
208        let (next_start, _) = pair[1].qspan_bounds();
209
210        if next_start < previous_end {
211            return false;
212        }
213
214        for pos in previous_end..next_start {
215            bridge_positions.insert(pos);
216        }
217    }
218
219    let container_only_boundary_positions = container_only_positions
220        .iter()
221        .filter(|&pos| !bridge_positions.contains(pos))
222        .count();
223
224    if container_only_positions.len() == 1
225        && container_only_boundary_positions == 0
226        && child_only_positions.is_empty()
227    {
228        return false;
229    }
230
231    if child_only_positions.is_empty()
232        && container_only_positions.len() == container_only_boundary_positions
233        && container_only_boundary_positions <= 3
234    {
235        let earliest_child = contained
236            .iter()
237            .map(|m| m.qspan_bounds().0)
238            .min()
239            .unwrap_or(usize::MAX);
240        let latest_child = contained
241            .iter()
242            .map(|m| m.qspan_bounds().1.saturating_sub(1))
243            .max()
244            .unwrap_or(0);
245
246        let is_one_sided_boundary = container_only_positions
247            .iter()
248            .all(|pos| pos < earliest_child)
249            || container_only_positions
250                .iter()
251                .all(|pos| pos > latest_child);
252
253        if is_one_sided_boundary {
254            return false;
255        }
256    }
257
258    let max_container_only_positions =
259        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
260    let max_container_boundary_positions =
261        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
262    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
263
264    container_only_positions.len() <= max_container_only_positions
265        && container_only_boundary_positions <= max_container_boundary_positions
266        && child_only_positions.len() <= max_child_only_positions
267}
268
269fn filter_redundant_same_expression_seq_containers(
270    seq_matches: Vec<LicenseMatch>,
271    candidate_contained_matches: &[LicenseMatch],
272) -> Vec<LicenseMatch> {
273    seq_matches
274        .into_iter()
275        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
276        .collect()
277}
278
279fn is_redundant_low_coverage_composite_seq_wrapper(
280    container: &LicenseMatch,
281    candidate_contained_matches: &[LicenseMatch],
282) -> bool {
283    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
284        return false;
285    }
286
287    let container_qspan_set = container.qspan_set();
288
289    let children: Vec<&LicenseMatch> = candidate_contained_matches
290        .iter()
291        .filter(|m| {
292            m.matcher == aho_match::MATCH_AHO
293                && has_full_match_coverage(m)
294                && m.license_expression != container.license_expression
295                && m.overlaps_with(&container_qspan_set)
296        })
297        .collect();
298
299    if children.len() < 2 {
300        return false;
301    }
302
303    let unique_expressions: HashSet<&str> = children
304        .iter()
305        .map(|m| m.license_expression.as_str())
306        .collect();
307    if unique_expressions.len() < 2 {
308        return false;
309    }
310
311    let mut child_union = PositionSet::new();
312    for m in &children {
313        child_union.extend_from_span(m.query_span());
314    }
315
316    let container_only_positions = container_qspan_set.difference(&child_union);
317    let child_only_positions = child_union.difference(&container_qspan_set);
318
319    let mut sorted_children = children;
320    sorted_children.sort_by_key(|m| m.qspan_bounds());
321
322    let mut bridge_positions = BitSet::new();
323    for pair in sorted_children.windows(2) {
324        let (_, previous_end) = pair[0].qspan_bounds();
325        let (next_start, _) = pair[1].qspan_bounds();
326        for pos in previous_end..next_start {
327            bridge_positions.insert(pos);
328        }
329    }
330
331    let container_only_boundary_positions = container_only_positions
332        .iter()
333        .filter(|&pos| !bridge_positions.contains(pos))
334        .count();
335
336    child_only_positions.is_empty()
337        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
338        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
339}
340
341fn filter_redundant_low_coverage_composite_seq_wrappers(
342    seq_matches: Vec<LicenseMatch>,
343    candidate_contained_matches: &[LicenseMatch],
344) -> Vec<LicenseMatch> {
345    seq_matches
346        .into_iter()
347        .filter(|m| {
348            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
349        })
350        .collect()
351}
352
353fn subtract_spdx_match_qspans(
354    query: &mut Query<'_>,
355    matched_qspans: &mut Vec<models::PositionSpan>,
356    aho_extra_matchables: &mut PositionSet,
357    spdx_matches: &[LicenseMatch],
358) {
359    for m in spdx_matches {
360        let Some(span) = query_span_for_match(m) else {
361            continue;
362        };
363
364        aho_extra_matchables.extend_from_span(&span);
365        query.subtract(&span);
366
367        if has_full_match_coverage(m) {
368            matched_qspans.push(span);
369        }
370    }
371}
372
373fn merge_and_prepare_aho_matches(
374    index: &index::LicenseIndex,
375    query: &mut Query<'_>,
376    matched_qspans: &mut Vec<models::PositionSpan>,
377    refined_aho: &[LicenseMatch],
378) -> (Vec<LicenseMatch>, bool) {
379    let merged_aho = merge_overlapping_matches(refined_aho);
380    let mut saw_long_exact_license_text_match = false;
381
382    for m in &merged_aho {
383        let Some(span) = query_span_for_match(m) else {
384            continue;
385        };
386
387        if has_full_match_coverage(m) {
388            matched_qspans.push(span.clone());
389        }
390
391        if index
392            .rules_by_rid
393            .get(m.rid)
394            .is_some_and(|rule| rule.is_license_text())
395            && m.rule_length > 120
396            && m.coverage() > 98.0
397        {
398            query.subtract(&span);
399            saw_long_exact_license_text_match = true;
400        }
401    }
402
403    (merged_aho, saw_long_exact_license_text_match)
404}
405
406fn collect_whole_query_exact_followup_matches(
407    index: &index::LicenseIndex,
408    query: &mut Query<'_>,
409    matched_qspans: &mut Vec<models::PositionSpan>,
410    whole_run: &query::QueryRun<'_>,
411    deadline: Option<Instant>,
412) -> Result<Vec<LicenseMatch>> {
413    let mut seq_all_matches = Vec::new();
414
415    if whole_run.is_matchable(false, matched_qspans) {
416        let near_dupe_candidates = if deadline.is_some() {
417            select_seq_candidates_with_deadline(
418                index,
419                whole_run,
420                true,
421                MAX_NEAR_DUPE_CANDIDATES,
422                deadline,
423            )?
424        } else {
425            self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
426        };
427
428        if !near_dupe_candidates.is_empty() {
429            let near_dupe_matches = if deadline.is_some() {
430                seq_match_with_candidates_and_deadline(
431                    index,
432                    whole_run,
433                    &near_dupe_candidates,
434                    deadline,
435                )?
436            } else {
437                self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
438            };
439
440            for m in &near_dupe_matches {
441                if !m.query_span().is_empty() {
442                    let span = m.query_span().clone();
443                    query.subtract(&span);
444                    matched_qspans.push(span);
445                }
446            }
447
448            seq_all_matches.extend(near_dupe_matches);
449        }
450    }
451
452    Ok(seq_all_matches)
453}
454
455fn collect_regular_seq_matches(
456    index: &index::LicenseIndex,
457    query: &Query<'_>,
458    matched_qspans: &[models::PositionSpan],
459    candidate_contained_matches: &[LicenseMatch],
460    deadline: Option<Instant>,
461) -> Result<Vec<LicenseMatch>> {
462    let mut seq_all_matches = Vec::new();
463
464    for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
465        if query_run_index % 8 == 0 {
466            ensure_within_deadline(deadline)?;
467        }
468
469        if !query_run.is_matchable(false, matched_qspans) {
470            continue;
471        }
472
473        let candidates = if deadline.is_some() {
474            select_seq_candidates_with_deadline(
475                index,
476                &query_run,
477                false,
478                MAX_REGULAR_SEQ_CANDIDATES,
479                deadline,
480            )?
481        } else {
482            self::seq_match::select_seq_candidates(
483                index,
484                &query_run,
485                false,
486                MAX_REGULAR_SEQ_CANDIDATES,
487            )
488        };
489        if !candidates.is_empty() {
490            let matches = if deadline.is_some() {
491                seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
492            } else {
493                self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
494            };
495            seq_all_matches.extend(matches);
496        }
497    }
498
499    let merged_seq = merge_overlapping_matches(&seq_all_matches);
500    let filtered_same_expression =
501        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
502    Ok(filter_redundant_low_coverage_composite_seq_wrappers(
503        filtered_same_expression,
504        candidate_contained_matches,
505    ))
506}
507
508impl LicenseDetectionEngine {
509    /// Create a new license detection engine from a pre-built license index.
510    ///
511    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
512    /// It builds the SPDX mapping from the licenses in the index.
513    fn from_index(
514        index: index::LicenseIndex,
515        spdx_license_list_version: Option<String>,
516        license_index_provenance: Option<LicenseIndexProvenance>,
517    ) -> Result<Self> {
518        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
519        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
520        let spdx_mapping = build_spdx_mapping(&license_vec);
521
522        Ok(Self {
523            index: Arc::new(index),
524            spdx_mapping,
525            spdx_license_list_version,
526            license_index_provenance,
527        })
528    }
529
530    #[cfg(test)]
531    pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
532        Self::from_index(index, None, None).expect("test index should build license engine")
533    }
534
535    /// Create a new license detection engine from the embedded license index.
536    ///
537    /// Convenience method that uses the default Provenant cache root and does
538    /// not force a reindex.
539    pub fn from_embedded() -> Result<Self> {
540        let cache_config =
541            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
542        Self::from_embedded_with_cache(&cache_config)
543    }
544
545    /// Create a new license detection engine from the embedded license index.
546    ///
547    /// This method loads the build-time embedded license artifact and constructs
548    /// the runtime license index. This eliminates the runtime dependency on the
549    /// ScanCode rules directory.
550    ///
551    /// If a valid cache exists (matching fingerprint), the index is loaded from
552    /// the rkyv cache file instead of being rebuilt from scratch.
553    ///
554    /// # Arguments
555    /// * `cache_config` - Cache configuration (directory and reindex flag)
556    ///
557    /// # Returns
558    /// A Result containing the engine or an error
559    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
560        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
561        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
562        let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
563            .map_err(|e| {
564                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
565            })?;
566        debug_assert_eq!(
567            artifact_metadata.license_index_provenance.source,
568            EMBEDDED_LICENSE_INDEX_SOURCE
569        );
570        let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
571        let provenance = Some(artifact_metadata.license_index_provenance.clone());
572
573        if !cache_config.reindex {
574            if let Some(cached) =
575                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
576            {
577                let start = Instant::now();
578                eprintln!(
579                    "License index loaded from rkyv cache in {:.2}s",
580                    start.elapsed().as_secs_f64()
581                );
582                return Self::from_index(cached, spdx_version, provenance);
583            }
584        } else {
585            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
586        }
587
588        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
589            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
590        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
591        let provenance = Some(snapshot.metadata.license_index_provenance.clone());
592
593        let start = Instant::now();
594        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
595        eprintln!(
596            "License index built from embedded artifact in {:.2}s",
597            start.elapsed().as_secs_f64()
598        );
599
600        let mut index = index;
601        index.spdx_license_list_version = spdx_version.clone();
602        if let Err(e) = save_cached_index(
603            cache_config,
604            LicenseCacheNamespace::Embedded,
605            &index,
606            &fingerprint,
607        ) {
608            eprintln!("Warning: failed to save license index cache: {}", e);
609        } else if let Some(size) =
610            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
611        {
612            eprintln!(
613                "License index cache saved ({:.1} MB)",
614                size as f64 / 1_048_576.0
615            );
616        }
617
618        Self::from_index(index, spdx_version, provenance)
619    }
620
621    /// Create a new license detection engine from a license dataset root.
622    ///
623    /// Convenience method that uses the default Provenant cache root and does
624    /// not force a reindex.
625    pub fn from_directory(rules_path: &Path) -> Result<Self> {
626        let cache_config =
627            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
628        Self::from_directory_with_cache(rules_path, &cache_config)
629    }
630
631    /// Create a new license detection engine from a directory of license rules.
632    ///
633    /// If a valid cache exists (matching fingerprint of the dataset), the index is
634    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
635    ///
636    /// # Arguments
637    /// * `rules_path` - Path to dataset root containing rules/ and licenses/
638    /// * `cache_config` - Cache configuration (directory and reindex flag)
639    ///
640    /// # Returns
641    /// A Result containing the engine or an error
642    pub fn from_directory_with_cache(
643        rules_path: &Path,
644        cache_config: &LicenseCacheConfig,
645    ) -> Result<Self> {
646        let LoadedLicenseDataset {
647            manifest,
648            rules: loaded_rules,
649            licenses: loaded_licenses,
650        } = load_license_dataset_from_root(rules_path)?;
651
652        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
653        let provenance = Some(LicenseIndexProvenance {
654            source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
655            dataset_fingerprint: compute_dataset_fingerprint_string(
656                &loaded_rules,
657                &loaded_licenses,
658            )?,
659            ignored_rules: vec![],
660            ignored_licenses: vec![],
661            ignored_rules_due_to_licenses: vec![],
662            added_rules: vec![],
663            replaced_rules: vec![],
664            added_licenses: vec![],
665            replaced_licenses: vec![],
666        });
667
668        if !cache_config.reindex {
669            if let Some(cached) = load_cached_index(
670                cache_config,
671                LicenseCacheNamespace::CustomRules,
672                &fingerprint,
673            )? {
674                let start = Instant::now();
675                eprintln!(
676                    "License index loaded from rkyv cache in {:.2}s",
677                    start.elapsed().as_secs_f64()
678                );
679                return Self::from_index(
680                    cached,
681                    Some(manifest.spdx_license_list_version),
682                    provenance,
683                );
684            }
685        } else {
686            delete_cache(
687                cache_config,
688                LicenseCacheNamespace::CustomRules,
689                &fingerprint,
690            )?;
691        }
692
693        let start = Instant::now();
694        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
695        eprintln!(
696            "License index built from custom dataset in {:.2}s",
697            start.elapsed().as_secs_f64()
698        );
699
700        if let Err(e) = save_cached_index(
701            cache_config,
702            LicenseCacheNamespace::CustomRules,
703            &index,
704            &fingerprint,
705        ) {
706            eprintln!("Warning: failed to save license index cache: {}", e);
707        } else if let Some(size) = cache_file_size(
708            cache_config,
709            LicenseCacheNamespace::CustomRules,
710            &fingerprint,
711        ) {
712            eprintln!(
713                "License index cache saved ({:.1} MB)",
714                size as f64 / 1_048_576.0
715            );
716        }
717
718        Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
719    }
720
721    pub fn embedded_spdx_license_list_version() -> Result<String> {
722        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
723        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
724            .map_err(|e| {
725                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
726            })?
727            .spdx_license_list_version)
728    }
729
730    pub fn detect_with_kind(
731        &self,
732        text: &str,
733        unknown_licenses: bool,
734        binary_derived: bool,
735    ) -> Result<Vec<LicenseDetection>> {
736        self.detect_with_kind_with_score_and_deadline(
737            text,
738            unknown_licenses,
739            binary_derived,
740            0.0,
741            None,
742        )
743    }
744
745    pub fn detect_with_kind_with_score(
746        &self,
747        text: &str,
748        unknown_licenses: bool,
749        binary_derived: bool,
750        min_score: f32,
751    ) -> Result<Vec<LicenseDetection>> {
752        self.detect_with_kind_with_score_and_deadline(
753            text,
754            unknown_licenses,
755            binary_derived,
756            min_score,
757            None,
758        )
759    }
760
761    pub(crate) fn detect_with_kind_with_score_and_deadline(
762        &self,
763        text: &str,
764        unknown_licenses: bool,
765        binary_derived: bool,
766        min_score: f32,
767        deadline: Option<Instant>,
768    ) -> Result<Vec<LicenseDetection>> {
769        ensure_within_deadline(deadline)?;
770        let clean_text = strip_utf8_bom_str(text);
771
772        let content = truncate_detection_text(clean_text);
773
774        ensure_within_deadline(deadline)?;
775        let mut query = if deadline.is_some() {
776            Query::from_extracted_text_with_deadline(
777                content,
778                &self.index,
779                binary_derived,
780                deadline,
781            )?
782        } else {
783            Query::from_extracted_text(content, &self.index, binary_derived)?
784        };
785        let whole_query_run = query.whole_query_run();
786
787        let mut all_matches = Vec::new();
788        let mut candidate_contained_matches = Vec::new();
789        let mut aho_extra_matchables = PositionSet::new();
790        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
791
792        // Phase 1a: Hash matching
793        // Python returns immediately if hash matches found (index.py:987-991)
794        {
795            ensure_within_deadline(deadline)?;
796            let hash_matches = hash_match(&self.index, &whole_query_run);
797
798            if !hash_matches.is_empty() {
799                let mut matches = hash_matches;
800                sort_matches_by_line(&mut matches);
801
802                let groups = split_groups_across_frontmatter_boundary(
803                    group_matches_by_region(&matches),
804                    Some(content),
805                );
806                let detections: Vec<LicenseDetection> = groups
807                    .iter()
808                    .map(|group| {
809                        let mut detection = empty_detection();
810                        populate_detection_from_group_with_spdx(
811                            &mut detection,
812                            group,
813                            &self.spdx_mapping,
814                            Some(content),
815                        );
816                        detection
817                    })
818                    .collect();
819
820                return Ok(post_process_detections(detections, min_score));
821            }
822        }
823
824        // Phase 1b: SPDX-LID matching
825        {
826            ensure_within_deadline(deadline)?;
827            let spdx_matches = spdx_lid_match(&self.index, &query);
828            subtract_spdx_match_qspans(
829                &mut query,
830                &mut matched_qspans,
831                &mut aho_extra_matchables,
832                &spdx_matches,
833            );
834            all_matches.extend(spdx_matches);
835        }
836
837        // Phase 1c: Aho-Corasick matching
838        {
839            ensure_within_deadline(deadline)?;
840            let aho_matches = if aho_extra_matchables.is_empty() {
841                if deadline.is_some() {
842                    aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
843                } else {
844                    aho_match(&self.index, &whole_query_run)
845                }
846            } else {
847                if deadline.is_some() {
848                    aho_match::aho_match_with_extra_matchables(
849                        &self.index,
850                        &whole_query_run,
851                        Some(&aho_extra_matchables),
852                        deadline,
853                    )?
854                } else {
855                    aho_match::aho_match_with_extra_matchables(
856                        &self.index,
857                        &whole_query_run,
858                        Some(&aho_extra_matchables),
859                        None,
860                    )?
861                }
862            };
863
864            // Python's get_exact_matches() calls refine_matches with merge=False
865            // This applies quality filters including required phrase filtering
866            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
867            candidate_contained_matches.extend(refined_aho.clone());
868            let (merged_aho, _) = merge_and_prepare_aho_matches(
869                &self.index,
870                &mut query,
871                &mut matched_qspans,
872                &refined_aho,
873            );
874            all_matches.extend(merged_aho);
875
876            let whole_query_followup = collect_whole_query_exact_followup_matches(
877                &self.index,
878                &mut query,
879                &mut matched_qspans,
880                &whole_query_run,
881                deadline,
882            )?;
883            all_matches.extend(whole_query_followup);
884
885            let merged_seq = collect_regular_seq_matches(
886                &self.index,
887                &query,
888                &matched_qspans,
889                &candidate_contained_matches,
890                deadline,
891            )?;
892            all_matches.extend(merged_seq);
893        }
894
895        // Step 1: Initial refine WITHOUT false positive filtering
896        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
897        ensure_within_deadline(deadline)?;
898        let merged_matches =
899            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
900
901        // Step 2: Unknown detection and weak match handling
902        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
903        let refined_matches = if unknown_licenses {
904            // Split weak from good - Python: index.py:1083
905            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
906
907            // Unknown detection on uncovered regions - Python: index.py:1093-1114
908            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
909            let filtered_unknown =
910                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
911
912            let mut all_matches = good_matches;
913            all_matches.extend(filtered_unknown);
914            // reinject weak matches and let refine matches keep the bests
915            // Python: index.py:1117-1118
916            all_matches.extend(weak_matches);
917            all_matches
918        } else {
919            merged_matches
920        };
921
922        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
923        ensure_within_deadline(deadline)?;
924        let refined = refine_matches(&self.index, refined_matches, &query);
925
926        let mut sorted = refined;
927        sort_matches_by_line(&mut sorted);
928
929        let groups = split_groups_across_frontmatter_boundary(
930            group_matches_by_region(&sorted),
931            Some(content),
932        );
933
934        let detections: Vec<LicenseDetection> = groups
935            .iter()
936            .map(|group| {
937                let mut detection = empty_detection();
938                populate_detection_from_group_with_spdx(
939                    &mut detection,
940                    group,
941                    &self.spdx_mapping,
942                    Some(content),
943                );
944                detection
945            })
946            .collect();
947
948        let detections = post_process_detections(detections, min_score);
949
950        ensure_within_deadline(deadline)?;
951        Ok(detections)
952    }
953
954    pub fn detect_with_kind_and_source(
955        &self,
956        text: &str,
957        unknown_licenses: bool,
958        binary_derived: bool,
959        source_path: &str,
960    ) -> Result<Vec<LicenseDetection>> {
961        self.detect_with_kind_and_source_with_deadline(
962            text,
963            unknown_licenses,
964            binary_derived,
965            source_path,
966            None,
967        )
968    }
969
970    pub(crate) fn detect_with_kind_and_source_with_deadline(
971        &self,
972        text: &str,
973        unknown_licenses: bool,
974        binary_derived: bool,
975        source_path: &str,
976        deadline: Option<Instant>,
977    ) -> Result<Vec<LicenseDetection>> {
978        let mut detections = self.detect_with_kind_with_score_and_deadline(
979            text,
980            unknown_licenses,
981            binary_derived,
982            0.0,
983            deadline,
984        )?;
985        attach_source_path_to_detections(&mut detections, source_path);
986        Ok(detections)
987    }
988
989    pub fn detect_with_kind_and_source_with_score(
990        &self,
991        text: &str,
992        unknown_licenses: bool,
993        binary_derived: bool,
994        source_path: &str,
995        min_score: f32,
996    ) -> Result<Vec<LicenseDetection>> {
997        let mut detections =
998            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
999        attach_source_path_to_detections(&mut detections, source_path);
1000        Ok(detections)
1001    }
1002
1003    pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
1004        &self,
1005        text: &str,
1006        unknown_licenses: bool,
1007        binary_derived: bool,
1008        source_path: &str,
1009        min_score: f32,
1010        deadline: Option<Instant>,
1011    ) -> Result<Vec<LicenseDetection>> {
1012        let mut detections = self.detect_with_kind_with_score_and_deadline(
1013            text,
1014            unknown_licenses,
1015            binary_derived,
1016            min_score,
1017            deadline,
1018        )?;
1019        attach_source_path_to_detections(&mut detections, source_path);
1020        Ok(detections)
1021    }
1022
1023    /// Detect licenses and return raw matches (like Python's idx.match()).
1024    ///
1025    /// This is primarily used by golden tests and maintenance tooling that need
1026    /// raw match sequences before grouping or post-processing into detections.
1027    #[cfg(any(test, feature = "golden-tests"))]
1028    pub fn detect_matches_with_kind(
1029        &self,
1030        text: &str,
1031        unknown_licenses: bool,
1032        binary_derived: bool,
1033    ) -> Result<Vec<LicenseMatch>> {
1034        let clean_text = strip_utf8_bom_str(text);
1035
1036        let content = truncate_detection_text(clean_text);
1037
1038        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1039        let whole_query_run = query.whole_query_run();
1040
1041        let mut all_matches = Vec::new();
1042        let mut candidate_contained_matches = Vec::new();
1043        let mut aho_extra_matchables = PositionSet::new();
1044        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1045
1046        // Phase 1a: Hash matching
1047        {
1048            let hash_matches = hash_match(&self.index, &whole_query_run);
1049
1050            if !hash_matches.is_empty() {
1051                let mut matches = hash_matches;
1052                sort_matches_by_line(&mut matches);
1053                return Ok(matches);
1054            }
1055        }
1056
1057        // Phase 1b: SPDX-LID matching
1058        {
1059            let spdx_matches = spdx_lid_match(&self.index, &query);
1060            subtract_spdx_match_qspans(
1061                &mut query,
1062                &mut matched_qspans,
1063                &mut aho_extra_matchables,
1064                &spdx_matches,
1065            );
1066            all_matches.extend(spdx_matches);
1067        }
1068
1069        // Phase 1c: Aho-Corasick matching
1070        {
1071            let aho_matches = if aho_extra_matchables.is_empty() {
1072                aho_match(&self.index, &whole_query_run)
1073            } else {
1074                aho_match::aho_match_with_extra_matchables(
1075                    &self.index,
1076                    &whole_query_run,
1077                    Some(&aho_extra_matchables),
1078                    None,
1079                )?
1080            };
1081            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1082            candidate_contained_matches.extend(refined_aho.clone());
1083            let (merged_aho, _) = merge_and_prepare_aho_matches(
1084                &self.index,
1085                &mut query,
1086                &mut matched_qspans,
1087                &refined_aho,
1088            );
1089            all_matches.extend(merged_aho);
1090
1091            let whole_query_followup = collect_whole_query_exact_followup_matches(
1092                &self.index,
1093                &mut query,
1094                &mut matched_qspans,
1095                &whole_query_run,
1096                None,
1097            )?;
1098            all_matches.extend(whole_query_followup);
1099
1100            let merged_seq = collect_regular_seq_matches(
1101                &self.index,
1102                &query,
1103                &matched_qspans,
1104                &candidate_contained_matches,
1105                None,
1106            )?;
1107            all_matches.extend(merged_seq);
1108        }
1109
1110        // Step 1: Initial refine WITHOUT false positive filtering
1111        let merged_matches =
1112            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1113
1114        // Step 2: Unknown detection and weak match handling
1115        let refined_matches = if unknown_licenses {
1116            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1117            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1118            let filtered_unknown =
1119                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1120
1121            let mut all_matches = good_matches;
1122            all_matches.extend(filtered_unknown);
1123            all_matches.extend(weak_matches);
1124            all_matches
1125        } else {
1126            merged_matches
1127        };
1128
1129        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
1130        let refined = refine_matches(&self.index, refined_matches, &query);
1131
1132        let mut sorted = refined;
1133        sort_matches_by_line(&mut sorted);
1134
1135        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
1136        Ok(sorted)
1137    }
1138
1139    /// Get a reference to the license index.
1140    pub fn index(&self) -> &index::LicenseIndex {
1141        &self.index
1142    }
1143
1144    pub fn spdx_license_list_version(&self) -> Option<&str> {
1145        self.spdx_license_list_version.as_deref()
1146    }
1147
1148    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
1149        self.license_index_provenance.as_ref()
1150    }
1151
1152    /// Get a reference to the SPDX mapping.
1153    #[cfg(test)]
1154    pub fn spdx_mapping(&self) -> &SpdxMapping {
1155        &self.spdx_mapping
1156    }
1157}
1158
1159pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1160    for ancestor in search_path.ancestors() {
1161        let candidate = ancestor.join("scancode_config.py");
1162        if candidate.is_file() {
1163            let config = fs::read_to_string(&candidate)?;
1164            return Ok(parse_scancode_spdx_license_list_version(&config));
1165        }
1166    }
1167
1168    Ok(None)
1169}
1170
/// Extracts the value assigned to `spdx_license_list_version` from the text of
/// a `scancode_config.py` file.
///
/// Lines are scanned top to bottom; the first line that assigns to exactly the
/// `spdx_license_list_version` variable wins. Longer identifiers that merely
/// share that prefix (for example `spdx_license_list_version_url`) are skipped.
/// Surrounding whitespace, the quotes of the string literal and any trailing
/// `# comment` are removed from the value.
///
/// Returns `None` when no line assigns to the variable.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const VARIABLE: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (target, value) = line.trim().split_once('=')?;
        let after_name = target.strip_prefix(VARIABLE)?;
        // An identifier character right after the name means this is a
        // different, longer variable whose value must not be returned.
        let is_longer_identifier =
            matches!(after_name.chars().next(), Some(c) if c.is_alphanumeric() || c == '_');
        (!is_longer_identifier).then(|| scancode_config_literal_value(value).to_string())
    })
}

/// Reduces the right-hand side of a simple assignment to its bare value.
///
/// A value that opens with a quote and has a matching closing quote yields the
/// text between the two, so anything after the literal (such as an inline
/// comment) is ignored. Anything else falls back to cutting at the first `#`
/// and peeling stray quote characters from both ends.
fn scancode_config_literal_value(raw: &str) -> &str {
    let raw = raw.trim();

    for quote in ['"', '\''] {
        let literal = raw
            .strip_prefix(quote)
            .and_then(|rest| rest.split_once(quote))
            // An empty "literal" is either an empty string or the start of a
            // triple-quoted one; the fallback below handles both.
            .filter(|(inner, _)| !inner.is_empty());
        if let Some((inner, _)) = literal {
            return inner;
        }
    }

    let before_comment = raw.split_once('#').map_or(raw, |(code, _)| code);
    before_comment.trim().trim_matches('"').trim_matches('\'')
}
1184
1185#[cfg(test)]
1186mod tests;