Skip to main content

provenant/license_detection/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! License Detection Engine
5
6pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(feature = "golden-tests")]
21pub mod golden_utils;
22pub mod hash_match;
23pub mod index;
24mod match_refine;
25pub mod models;
26pub mod query;
27pub mod rules;
28pub mod seq_match;
29pub mod spdx_lid;
30pub mod spdx_mapping;
31#[cfg(test)]
32mod test_utils;
33pub mod tokenize;
34pub mod unknown_match;
35
36use bit_set::BitSet;
37use std::collections::HashSet;
38use std::fs;
39use std::path::Path;
40use std::sync::Arc;
41use std::time::Instant;
42
43use anyhow::Result;
44
45use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
46use crate::license_detection::dataset::{
47    CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
48    load_license_dataset_from_root,
49};
50use crate::license_detection::embedded::index::{
51    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
52};
53use crate::license_detection::index::build_index_from_loaded;
54use crate::license_detection::license_cache::{
55    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
56    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
57};
58use crate::license_detection::query::Query;
59use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
60use crate::models::LicenseIndexProvenance;
61use crate::utils::text::strip_utf8_bom_str;
62
63use crate::license_detection::detection::{
64    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
65    split_groups_across_frontmatter_boundary,
66};
67use crate::license_detection::models::MatcherKind;
68
/// Path to the license rules directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Path to the licenses directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Path to the license data directory in the reference scancode-toolkit submodule.
/// Used by test code and the xtask generate-license-loader-artifact binary.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// Default LicenseDB URL template containing a single `{}` placeholder.
///
/// NOTE(review): the placeholder is presumably replaced with a license key by the
/// code that consumes this template; confirm against the callers.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Errors produced by the deadline-aware license detection pipeline.
#[derive(Debug, Clone, thiserror::Error)]
pub(crate) enum LicenseDetectionError {
    /// The caller-supplied deadline was reached before detection finished.
    #[error("license detection timed out")]
    Timeout,
}
92
93pub(crate) use detection::{
94    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
95};
96pub use models::LicenseMatch;
97
98pub use aho_match::aho_match;
99pub use hash_match::hash_match;
100pub use match_refine::{
101    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
102    refine_matches_without_false_positive_filter, split_weak_matches,
103};
104pub use position_set::PositionSet;
105pub use spdx_lid::spdx_lid_match;
106pub use token_multiset::TokenMultiset;
107pub use token_set::TokenSet;
108pub use unknown_match::unknown_match;
109
110use self::seq_match::{
111    MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
112    seq_match_with_candidates_and_deadline,
113};
114
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    // License index used by every matcher; held behind an `Arc`.
    index: Arc<index::LicenseIndex>,
    // SPDX mapping built from the licenses stored in the index.
    spdx_mapping: SpdxMapping,
    // SPDX license list version reported by the artifact metadata or dataset manifest, if any.
    spdx_license_list_version: Option<String>,
    // Provenance of the license index (embedded artifact or custom dataset), if any.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
127
// Upper bound, in bytes, on the text handed to detection; longer input is truncated.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
// Candidate limit passed to sequence-candidate selection for regular (per query run) matching.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
// Position budget used by the redundant sequence-container heuristics below.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
// Base allowance for positions covered by contained matches but not by their container.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
132
/// Report whether an optional `deadline` has already been reached.
///
/// A `None` deadline never expires; otherwise the current instant is compared
/// against the deadline.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
136
137pub(crate) fn ensure_within_deadline(
138    deadline: Option<Instant>,
139) -> Result<(), LicenseDetectionError> {
140    if deadline_exceeded(deadline) {
141        Err(LicenseDetectionError::Timeout)
142    } else {
143        Ok(())
144    }
145}
146
147fn truncate_detection_text(clean_text: &str) -> &str {
148    if clean_text.len() <= MAX_DETECTION_SIZE {
149        return clean_text;
150    }
151
152    log::debug!(
153        "Content size {} exceeds limit {}, truncating for detection",
154        clean_text.len(),
155        MAX_DETECTION_SIZE
156    );
157
158    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
159    &clean_text[..boundary]
160}
161
162fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
163    (!m.query_span().is_empty()).then(|| m.query_span().clone())
164}
165
166fn has_full_match_coverage(m: &LicenseMatch) -> bool {
167    m.coverage() == 100.0
168}
169
/// Decide whether a sequence match is a redundant "container" around exact matches.
///
/// `container` is considered redundant when it is a `Seq` match with coverage of at
/// least 99 and at least two full-coverage Aho matches from
/// `candidate_contained_matches` carry the same license expression, overlap the
/// container's query positions, do not overlap each other, and together account for
/// the container's positions within the gap budgets computed at the end.
///
/// Returns `true` when the container should be dropped, `false` otherwise.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // NOTE(review): the first disjunct is subsumed by the `>= 99.0` check.
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Exact (Aho, full coverage) matches with the same expression that overlap the container.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    // A single exact match cannot make the container a redundant wrapper.
    if contained.len() < 2 {
        return false;
    }

    // Require at least two children whose matched length exceeds one.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    // Order children by their query span bounds so adjacent pairs can be compared.
    contained.sort_by_key(|m| m.qspan_bounds());

    // Union of all query positions covered by the children.
    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    // Positions covered by only one side.
    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // "Bridge" positions are the gaps between consecutive children.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children whose bounds overlap each other disqualify the container.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that do not fall into a gap between children.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Keep the container when its only extra position is a single bridge position
    // and the children add nothing beyond it.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Keep the container when all of its extra positions (at most three, none of
    // them bridge positions) lie entirely before the first child or entirely after
    // the last child.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        // The upper bound is used as exclusive here, hence the `saturating_sub(1)`.
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Gap budgets scale with the number of children; `contained.len() >= 2` is
    // guaranteed above, so the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    // Redundant only when every difference stays within its budget.
    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
274
275fn filter_redundant_same_expression_seq_containers(
276    seq_matches: Vec<LicenseMatch>,
277    candidate_contained_matches: &[LicenseMatch],
278) -> Vec<LicenseMatch> {
279    seq_matches
280        .into_iter()
281        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
282        .collect()
283}
284
285fn is_redundant_low_coverage_composite_seq_wrapper(
286    container: &LicenseMatch,
287    candidate_contained_matches: &[LicenseMatch],
288) -> bool {
289    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
290        return false;
291    }
292
293    let container_qspan_set = container.qspan_set();
294
295    let children: Vec<&LicenseMatch> = candidate_contained_matches
296        .iter()
297        .filter(|m| {
298            m.matcher == aho_match::MATCH_AHO
299                && has_full_match_coverage(m)
300                && m.license_expression != container.license_expression
301                && m.overlaps_with(&container_qspan_set)
302        })
303        .collect();
304
305    if children.len() < 2 {
306        return false;
307    }
308
309    let unique_expressions: HashSet<&str> = children
310        .iter()
311        .map(|m| m.license_expression.as_str())
312        .collect();
313    if unique_expressions.len() < 2 {
314        return false;
315    }
316
317    let mut child_union = PositionSet::new();
318    for m in &children {
319        child_union.extend_from_span(m.query_span());
320    }
321
322    let container_only_positions = container_qspan_set.difference(&child_union);
323    let child_only_positions = child_union.difference(&container_qspan_set);
324
325    let mut sorted_children = children;
326    sorted_children.sort_by_key(|m| m.qspan_bounds());
327
328    let mut bridge_positions = BitSet::new();
329    for pair in sorted_children.windows(2) {
330        let (_, previous_end) = pair[0].qspan_bounds();
331        let (next_start, _) = pair[1].qspan_bounds();
332        for pos in previous_end..next_start {
333            bridge_positions.insert(pos);
334        }
335    }
336
337    let container_only_boundary_positions = container_only_positions
338        .iter()
339        .filter(|&pos| !bridge_positions.contains(pos))
340        .count();
341
342    child_only_positions.is_empty()
343        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
344        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
345}
346
347fn filter_redundant_low_coverage_composite_seq_wrappers(
348    seq_matches: Vec<LicenseMatch>,
349    candidate_contained_matches: &[LicenseMatch],
350) -> Vec<LicenseMatch> {
351    seq_matches
352        .into_iter()
353        .filter(|m| {
354            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
355        })
356        .collect()
357}
358
359fn subtract_spdx_match_qspans(
360    query: &mut Query<'_>,
361    matched_qspans: &mut Vec<models::PositionSpan>,
362    aho_extra_matchables: &mut PositionSet,
363    spdx_matches: &[LicenseMatch],
364) {
365    for m in spdx_matches {
366        let Some(span) = query_span_for_match(m) else {
367            continue;
368        };
369
370        aho_extra_matchables.extend_from_span(&span);
371        query.subtract(&span);
372
373        if has_full_match_coverage(m) {
374            matched_qspans.push(span);
375        }
376    }
377}
378
379fn merge_and_prepare_aho_matches(
380    index: &index::LicenseIndex,
381    query: &mut Query<'_>,
382    matched_qspans: &mut Vec<models::PositionSpan>,
383    refined_aho: &[LicenseMatch],
384) -> (Vec<LicenseMatch>, bool) {
385    let merged_aho = merge_overlapping_matches(refined_aho);
386    let mut saw_long_exact_license_text_match = false;
387
388    for m in &merged_aho {
389        let Some(span) = query_span_for_match(m) else {
390            continue;
391        };
392
393        if has_full_match_coverage(m) {
394            matched_qspans.push(span.clone());
395        }
396
397        if index.rule(m.rid).is_some_and(|rule| rule.is_license_text())
398            && m.rule_length > 120
399            && m.coverage() > 98.0
400        {
401            query.subtract(&span);
402            saw_long_exact_license_text_match = true;
403        }
404    }
405
406    (merged_aho, saw_long_exact_license_text_match)
407}
408
409fn collect_whole_query_exact_followup_matches(
410    index: &index::LicenseIndex,
411    query: &mut Query<'_>,
412    matched_qspans: &mut Vec<models::PositionSpan>,
413    whole_run: &query::QueryRun<'_>,
414    deadline: Option<Instant>,
415) -> Result<Vec<LicenseMatch>, LicenseDetectionError> {
416    let mut seq_all_matches = Vec::new();
417
418    if whole_run.is_matchable(false, matched_qspans) {
419        let near_dupe_candidates = if deadline.is_some() {
420            select_seq_candidates_with_deadline(
421                index,
422                whole_run,
423                true,
424                MAX_NEAR_DUPE_CANDIDATES,
425                deadline,
426            )?
427        } else {
428            self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
429        };
430
431        if !near_dupe_candidates.is_empty() {
432            let near_dupe_matches = if deadline.is_some() {
433                seq_match_with_candidates_and_deadline(
434                    index,
435                    whole_run,
436                    &near_dupe_candidates,
437                    deadline,
438                )?
439            } else {
440                self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
441            };
442
443            for m in &near_dupe_matches {
444                if !m.query_span().is_empty() {
445                    let span = m.query_span().clone();
446                    query.subtract(&span);
447                    matched_qspans.push(span);
448                }
449            }
450
451            seq_all_matches.extend(near_dupe_matches);
452        }
453    }
454
455    Ok(seq_all_matches)
456}
457
458fn collect_regular_seq_matches(
459    index: &index::LicenseIndex,
460    query: &Query<'_>,
461    matched_qspans: &[models::PositionSpan],
462    candidate_contained_matches: &[LicenseMatch],
463    deadline: Option<Instant>,
464) -> Result<Vec<LicenseMatch>, LicenseDetectionError> {
465    let mut seq_all_matches = Vec::new();
466
467    for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
468        if query_run_index % 8 == 0 {
469            ensure_within_deadline(deadline)?;
470        }
471
472        if !query_run.is_matchable(false, matched_qspans) {
473            continue;
474        }
475
476        let candidates = if deadline.is_some() {
477            select_seq_candidates_with_deadline(
478                index,
479                &query_run,
480                false,
481                MAX_REGULAR_SEQ_CANDIDATES,
482                deadline,
483            )?
484        } else {
485            self::seq_match::select_seq_candidates(
486                index,
487                &query_run,
488                false,
489                MAX_REGULAR_SEQ_CANDIDATES,
490            )
491        };
492        if !candidates.is_empty() {
493            let matches = if deadline.is_some() {
494                seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
495            } else {
496                self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
497            };
498            seq_all_matches.extend(matches);
499        }
500    }
501
502    let merged_seq = merge_overlapping_matches(&seq_all_matches);
503    let filtered_same_expression =
504        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
505    Ok(filter_redundant_low_coverage_composite_seq_wrappers(
506        filtered_same_expression,
507        candidate_contained_matches,
508    ))
509}
510
511impl LicenseDetectionEngine {
512    /// Create a new license detection engine from a pre-built license index.
513    ///
514    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
515    /// It builds the SPDX mapping from the licenses in the index.
516    fn from_index(
517        index: index::LicenseIndex,
518        spdx_license_list_version: Option<String>,
519        license_index_provenance: Option<LicenseIndexProvenance>,
520    ) -> Result<Self> {
521        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
522        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
523        let spdx_mapping = build_spdx_mapping(&license_vec);
524
525        Ok(Self {
526            index: Arc::new(index),
527            spdx_mapping,
528            spdx_license_list_version,
529            license_index_provenance,
530        })
531    }
532
533    #[cfg(test)]
534    pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
535        Self::from_index(index, None, None).expect("test index should build license engine")
536    }
537
538    /// Create a new license detection engine from the embedded license index.
539    ///
540    /// Convenience method that uses the default Provenant cache root and does
541    /// not force a reindex.
542    pub fn from_embedded() -> Result<Self> {
543        let cache_config =
544            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
545        Self::from_embedded_with_cache(&cache_config)
546    }
547
548    /// Create a new license detection engine from the embedded license index.
549    ///
550    /// This method loads the build-time embedded license artifact and constructs
551    /// the runtime license index. This eliminates the runtime dependency on the
552    /// ScanCode rules directory.
553    ///
554    /// If a valid cache exists (matching fingerprint), the index is loaded from
555    /// the rkyv cache file instead of being rebuilt from scratch.
556    ///
557    /// # Arguments
558    /// * `cache_config` - Cache configuration (directory and reindex flag)
559    ///
560    /// # Returns
561    /// A Result containing the engine or an error
562    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
563        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
564        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
565        let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
566            .map_err(|e| {
567                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
568            })?;
569        debug_assert_eq!(
570            artifact_metadata.license_index_provenance.source,
571            EMBEDDED_LICENSE_INDEX_SOURCE
572        );
573        let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
574        let provenance = Some(artifact_metadata.license_index_provenance.clone());
575
576        if !cache_config.reindex {
577            if let Some(cached) =
578                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
579            {
580                let start = Instant::now();
581                eprintln!(
582                    "License index loaded from rkyv cache in {:.2}s",
583                    start.elapsed().as_secs_f64()
584                );
585                return Self::from_index(cached, spdx_version, provenance);
586            }
587        } else {
588            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
589        }
590
591        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
592            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
593        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
594        let provenance = Some(snapshot.metadata.license_index_provenance.clone());
595
596        let start = Instant::now();
597        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
598        eprintln!(
599            "License index built from embedded artifact in {:.2}s",
600            start.elapsed().as_secs_f64()
601        );
602
603        let mut index = index;
604        index.spdx_license_list_version = spdx_version.clone();
605        if let Err(e) = save_cached_index(
606            cache_config,
607            LicenseCacheNamespace::Embedded,
608            &index,
609            &fingerprint,
610        ) {
611            eprintln!("Warning: failed to save license index cache: {}", e);
612        } else if let Some(size) =
613            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
614        {
615            eprintln!(
616                "License index cache saved ({:.1} MB)",
617                size as f64 / 1_048_576.0
618            );
619        }
620
621        Self::from_index(index, spdx_version, provenance)
622    }
623
624    /// Create a new license detection engine from a license dataset root.
625    ///
626    /// Convenience method that uses the default Provenant cache root and does
627    /// not force a reindex.
628    pub fn from_directory(rules_path: &Path) -> Result<Self> {
629        let cache_config =
630            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
631        Self::from_directory_with_cache(rules_path, &cache_config)
632    }
633
634    /// Create a new license detection engine from a directory of license rules.
635    ///
636    /// If a valid cache exists (matching fingerprint of the dataset), the index is
637    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
638    ///
639    /// # Arguments
640    /// * `rules_path` - Path to dataset root containing rules/ and licenses/
641    /// * `cache_config` - Cache configuration (directory and reindex flag)
642    ///
643    /// # Returns
644    /// A Result containing the engine or an error
645    pub fn from_directory_with_cache(
646        rules_path: &Path,
647        cache_config: &LicenseCacheConfig,
648    ) -> Result<Self> {
649        let LoadedLicenseDataset {
650            manifest,
651            rules: loaded_rules,
652            licenses: loaded_licenses,
653        } = load_license_dataset_from_root(rules_path)?;
654
655        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
656        let provenance = Some(LicenseIndexProvenance {
657            source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
658            dataset_fingerprint: compute_dataset_fingerprint_string(
659                &loaded_rules,
660                &loaded_licenses,
661            )?,
662            ignored_rules: vec![],
663            ignored_licenses: vec![],
664            ignored_rules_due_to_licenses: vec![],
665            added_rules: vec![],
666            replaced_rules: vec![],
667            added_licenses: vec![],
668            replaced_licenses: vec![],
669        });
670
671        if !cache_config.reindex {
672            if let Some(cached) = load_cached_index(
673                cache_config,
674                LicenseCacheNamespace::CustomRules,
675                &fingerprint,
676            )? {
677                let start = Instant::now();
678                eprintln!(
679                    "License index loaded from rkyv cache in {:.2}s",
680                    start.elapsed().as_secs_f64()
681                );
682                return Self::from_index(
683                    cached,
684                    Some(manifest.spdx_license_list_version),
685                    provenance,
686                );
687            }
688        } else {
689            delete_cache(
690                cache_config,
691                LicenseCacheNamespace::CustomRules,
692                &fingerprint,
693            )?;
694        }
695
696        let start = Instant::now();
697        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
698        eprintln!(
699            "License index built from custom dataset in {:.2}s",
700            start.elapsed().as_secs_f64()
701        );
702
703        if let Err(e) = save_cached_index(
704            cache_config,
705            LicenseCacheNamespace::CustomRules,
706            &index,
707            &fingerprint,
708        ) {
709            eprintln!("Warning: failed to save license index cache: {}", e);
710        } else if let Some(size) = cache_file_size(
711            cache_config,
712            LicenseCacheNamespace::CustomRules,
713            &fingerprint,
714        ) {
715            eprintln!(
716                "License index cache saved ({:.1} MB)",
717                size as f64 / 1_048_576.0
718            );
719        }
720
721        Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
722    }
723
724    pub fn embedded_spdx_license_list_version() -> Result<String> {
725        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
726        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
727            .map_err(|e| {
728                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
729            })?
730            .spdx_license_list_version)
731    }
732
733    pub fn detect_with_kind(
734        &self,
735        text: &str,
736        unknown_licenses: bool,
737        binary_derived: bool,
738    ) -> Result<Vec<LicenseDetection>> {
739        self.detect_with_kind_with_score_and_deadline(
740            text,
741            unknown_licenses,
742            binary_derived,
743            0.0,
744            None,
745        )
746        .map_err(Into::into)
747    }
748
749    pub fn detect_with_kind_with_score(
750        &self,
751        text: &str,
752        unknown_licenses: bool,
753        binary_derived: bool,
754        min_score: f32,
755    ) -> Result<Vec<LicenseDetection>> {
756        self.detect_with_kind_with_score_and_deadline(
757            text,
758            unknown_licenses,
759            binary_derived,
760            min_score,
761            None,
762        )
763        .map_err(Into::into)
764    }
765
766    pub(crate) fn detect_with_kind_with_score_and_deadline(
767        &self,
768        text: &str,
769        unknown_licenses: bool,
770        binary_derived: bool,
771        min_score: f32,
772        deadline: Option<Instant>,
773    ) -> Result<Vec<LicenseDetection>, LicenseDetectionError> {
774        ensure_within_deadline(deadline)?;
775        let clean_text = strip_utf8_bom_str(text);
776
777        let content = truncate_detection_text(clean_text);
778
779        ensure_within_deadline(deadline)?;
780        let mut query = if deadline.is_some() {
781            Query::from_extracted_text_with_deadline(
782                content,
783                &self.index,
784                binary_derived,
785                deadline,
786            )?
787        } else {
788            Query::from_extracted_text(content, &self.index, binary_derived)?
789        };
790        let whole_query_run = query.whole_query_run();
791
792        let mut all_matches = Vec::new();
793        let mut candidate_contained_matches = Vec::new();
794        let mut aho_extra_matchables = PositionSet::new();
795        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
796
797        // Phase 1a: Hash matching
798        // Python returns immediately if hash matches found (index.py:987-991)
799        {
800            ensure_within_deadline(deadline)?;
801            let hash_matches = hash_match(&self.index, &whole_query_run);
802
803            if !hash_matches.is_empty() {
804                let mut matches = hash_matches;
805                sort_matches_by_line(&mut matches);
806
807                let groups = split_groups_across_frontmatter_boundary(
808                    group_matches_by_region(&matches),
809                    Some(content),
810                );
811                let detections: Vec<LicenseDetection> = groups
812                    .iter()
813                    .map(|group| {
814                        let mut detection = empty_detection();
815                        populate_detection_from_group_with_spdx(
816                            &mut detection,
817                            group,
818                            &self.spdx_mapping,
819                            Some(content),
820                        );
821                        detection
822                    })
823                    .collect();
824
825                return Ok(post_process_detections(detections, min_score));
826            }
827        }
828
829        // Phase 1b: SPDX-LID matching
830        {
831            ensure_within_deadline(deadline)?;
832            let spdx_matches = spdx_lid_match(&self.index, &query);
833            subtract_spdx_match_qspans(
834                &mut query,
835                &mut matched_qspans,
836                &mut aho_extra_matchables,
837                &spdx_matches,
838            );
839            all_matches.extend(spdx_matches);
840        }
841
842        // Phase 1c: Aho-Corasick matching
843        {
844            ensure_within_deadline(deadline)?;
845            let aho_matches = if aho_extra_matchables.is_empty() {
846                if deadline.is_some() {
847                    aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
848                } else {
849                    aho_match(&self.index, &whole_query_run)
850                }
851            } else {
852                if deadline.is_some() {
853                    aho_match::aho_match_with_extra_matchables(
854                        &self.index,
855                        &whole_query_run,
856                        Some(&aho_extra_matchables),
857                        deadline,
858                    )?
859                } else {
860                    aho_match::aho_match_with_extra_matchables(
861                        &self.index,
862                        &whole_query_run,
863                        Some(&aho_extra_matchables),
864                        None,
865                    )?
866                }
867            };
868
869            // Python's get_exact_matches() calls refine_matches with merge=False
870            // This applies quality filters including required phrase filtering
871            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
872            candidate_contained_matches.extend(refined_aho.clone());
873            let (merged_aho, _) = merge_and_prepare_aho_matches(
874                &self.index,
875                &mut query,
876                &mut matched_qspans,
877                &refined_aho,
878            );
879            all_matches.extend(merged_aho);
880
881            let whole_query_followup = collect_whole_query_exact_followup_matches(
882                &self.index,
883                &mut query,
884                &mut matched_qspans,
885                &whole_query_run,
886                deadline,
887            )?;
888            all_matches.extend(whole_query_followup);
889
890            let merged_seq = collect_regular_seq_matches(
891                &self.index,
892                &query,
893                &matched_qspans,
894                &candidate_contained_matches,
895                deadline,
896            )?;
897            all_matches.extend(merged_seq);
898        }
899
900        // Step 1: Initial refine WITHOUT false positive filtering
901        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
902        ensure_within_deadline(deadline)?;
903        let merged_matches =
904            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
905
906        // Step 2: Unknown detection and weak match handling
907        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
908        let refined_matches = if unknown_licenses {
909            // Split weak from good - Python: index.py:1083
910            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
911
912            // Unknown detection on uncovered regions - Python: index.py:1093-1114
913            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
914            let filtered_unknown =
915                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
916
917            let mut all_matches = good_matches;
918            all_matches.extend(filtered_unknown);
919            // reinject weak matches and let refine matches keep the bests
920            // Python: index.py:1117-1118
921            all_matches.extend(weak_matches);
922            all_matches
923        } else {
924            merged_matches
925        };
926
927        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
928        ensure_within_deadline(deadline)?;
929        let refined = refine_matches(&self.index, refined_matches, &query);
930
931        let mut sorted = refined;
932        sort_matches_by_line(&mut sorted);
933
934        let groups = split_groups_across_frontmatter_boundary(
935            group_matches_by_region(&sorted),
936            Some(content),
937        );
938
939        let detections: Vec<LicenseDetection> = groups
940            .iter()
941            .map(|group| {
942                let mut detection = empty_detection();
943                populate_detection_from_group_with_spdx(
944                    &mut detection,
945                    group,
946                    &self.spdx_mapping,
947                    Some(content),
948                );
949                detection
950            })
951            .collect();
952
953        let detections = post_process_detections(detections, min_score);
954
955        ensure_within_deadline(deadline)?;
956        Ok(detections)
957    }
958
959    pub fn detect_with_kind_and_source(
960        &self,
961        text: &str,
962        unknown_licenses: bool,
963        binary_derived: bool,
964        source_path: &str,
965    ) -> Result<Vec<LicenseDetection>> {
966        self.detect_with_kind_and_source_with_deadline(
967            text,
968            unknown_licenses,
969            binary_derived,
970            source_path,
971            None,
972        )
973        .map_err(Into::into)
974    }
975
976    pub(crate) fn detect_with_kind_and_source_with_deadline(
977        &self,
978        text: &str,
979        unknown_licenses: bool,
980        binary_derived: bool,
981        source_path: &str,
982        deadline: Option<Instant>,
983    ) -> Result<Vec<LicenseDetection>, LicenseDetectionError> {
984        let mut detections = self.detect_with_kind_with_score_and_deadline(
985            text,
986            unknown_licenses,
987            binary_derived,
988            0.0,
989            deadline,
990        )?;
991        attach_source_path_to_detections(&mut detections, source_path);
992        Ok(detections)
993    }
994
995    pub fn detect_with_kind_and_source_with_score(
996        &self,
997        text: &str,
998        unknown_licenses: bool,
999        binary_derived: bool,
1000        source_path: &str,
1001        min_score: f32,
1002    ) -> Result<Vec<LicenseDetection>> {
1003        let mut detections =
1004            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
1005        attach_source_path_to_detections(&mut detections, source_path);
1006        Ok(detections)
1007    }
1008
1009    pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
1010        &self,
1011        text: &str,
1012        unknown_licenses: bool,
1013        binary_derived: bool,
1014        source_path: &str,
1015        min_score: f32,
1016        deadline: Option<Instant>,
1017    ) -> Result<Vec<LicenseDetection>, LicenseDetectionError> {
1018        let mut detections = self.detect_with_kind_with_score_and_deadline(
1019            text,
1020            unknown_licenses,
1021            binary_derived,
1022            min_score,
1023            deadline,
1024        )?;
1025        attach_source_path_to_detections(&mut detections, source_path);
1026        Ok(detections)
1027    }
1028
1029    /// Detect licenses and return raw matches (like Python's idx.match()).
1030    ///
1031    /// This is primarily used by golden tests and maintenance tooling that need
1032    /// raw match sequences before grouping or post-processing into detections.
1033    #[cfg(any(test, feature = "golden-tests"))]
1034    pub fn detect_matches_with_kind(
1035        &self,
1036        text: &str,
1037        unknown_licenses: bool,
1038        binary_derived: bool,
1039    ) -> Result<Vec<LicenseMatch>> {
1040        let clean_text = strip_utf8_bom_str(text);
1041
1042        let content = truncate_detection_text(clean_text);
1043
1044        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1045        let whole_query_run = query.whole_query_run();
1046
1047        let mut all_matches = Vec::new();
1048        let mut candidate_contained_matches = Vec::new();
1049        let mut aho_extra_matchables = PositionSet::new();
1050        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1051
1052        // Phase 1a: Hash matching
1053        {
1054            let hash_matches = hash_match(&self.index, &whole_query_run);
1055
1056            if !hash_matches.is_empty() {
1057                let mut matches = hash_matches;
1058                sort_matches_by_line(&mut matches);
1059                return Ok(matches);
1060            }
1061        }
1062
1063        // Phase 1b: SPDX-LID matching
1064        {
1065            let spdx_matches = spdx_lid_match(&self.index, &query);
1066            subtract_spdx_match_qspans(
1067                &mut query,
1068                &mut matched_qspans,
1069                &mut aho_extra_matchables,
1070                &spdx_matches,
1071            );
1072            all_matches.extend(spdx_matches);
1073        }
1074
1075        // Phase 1c: Aho-Corasick matching
1076        {
1077            let aho_matches = if aho_extra_matchables.is_empty() {
1078                aho_match(&self.index, &whole_query_run)
1079            } else {
1080                aho_match::aho_match_with_extra_matchables(
1081                    &self.index,
1082                    &whole_query_run,
1083                    Some(&aho_extra_matchables),
1084                    None,
1085                )?
1086            };
1087            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1088            candidate_contained_matches.extend(refined_aho.clone());
1089            let (merged_aho, _) = merge_and_prepare_aho_matches(
1090                &self.index,
1091                &mut query,
1092                &mut matched_qspans,
1093                &refined_aho,
1094            );
1095            all_matches.extend(merged_aho);
1096
1097            let whole_query_followup = collect_whole_query_exact_followup_matches(
1098                &self.index,
1099                &mut query,
1100                &mut matched_qspans,
1101                &whole_query_run,
1102                None,
1103            )?;
1104            all_matches.extend(whole_query_followup);
1105
1106            let merged_seq = collect_regular_seq_matches(
1107                &self.index,
1108                &query,
1109                &matched_qspans,
1110                &candidate_contained_matches,
1111                None,
1112            )?;
1113            all_matches.extend(merged_seq);
1114        }
1115
1116        // Step 1: Initial refine WITHOUT false positive filtering
1117        let merged_matches =
1118            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1119
1120        // Step 2: Unknown detection and weak match handling
1121        let refined_matches = if unknown_licenses {
1122            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1123            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1124            let filtered_unknown =
1125                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1126
1127            let mut all_matches = good_matches;
1128            all_matches.extend(filtered_unknown);
1129            all_matches.extend(weak_matches);
1130            all_matches
1131        } else {
1132            merged_matches
1133        };
1134
1135        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
1136        let refined = refine_matches(&self.index, refined_matches, &query);
1137
1138        let mut sorted = refined;
1139        sort_matches_by_line(&mut sorted);
1140
1141        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
1142        Ok(sorted)
1143    }
1144
1145    /// Get a reference to the license index.
1146    pub fn index(&self) -> &index::LicenseIndex {
1147        &self.index
1148    }
1149
1150    pub fn spdx_license_list_version(&self) -> Option<&str> {
1151        self.spdx_license_list_version.as_deref()
1152    }
1153
1154    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
1155        self.license_index_provenance.as_ref()
1156    }
1157
1158    /// Get a reference to the SPDX mapping.
1159    #[cfg(test)]
1160    pub fn spdx_mapping(&self) -> &SpdxMapping {
1161        &self.spdx_mapping
1162    }
1163}
1164
1165pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1166    for ancestor in search_path.ancestors() {
1167        let candidate = ancestor.join("scancode_config.py");
1168        if candidate.is_file() {
1169            let config = fs::read_to_string(&candidate)?;
1170            return Ok(parse_scancode_spdx_license_list_version(&config));
1171        }
1172    }
1173
1174    Ok(None)
1175}
1176
/// Extracts the value assigned to `spdx_license_list_version` from the text of
/// a `scancode_config.py` file.
///
/// Lines are scanned in order and the first matching assignment wins. A line
/// matches when its left-hand side is exactly the identifier, optionally
/// followed by a type annotation (`spdx_license_list_version: str = ...`).
/// Longer identifiers that merely share the prefix and `==` comparisons are
/// skipped. A trailing `# comment` is dropped, then surrounding whitespace and
/// quote characters are trimmed from the value.
///
/// Returns `None` when no line matches.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (target, assigned) = line.split_once('=')?;

        // Require the identifier itself: a bare prefix test would also accept
        // unrelated names such as `spdx_license_list_version_url`.
        let after_key = target.trim().strip_prefix(KEY)?.trim_start();
        if !(after_key.is_empty() || after_key.starts_with(':')) {
            return None;
        }

        // `name == value` is a comparison, not an assignment.
        if assigned.starts_with('=') {
            return None;
        }

        // Drop a trailing `# comment` so it does not end up in the version.
        let value = assigned.split('#').next().unwrap_or(assigned);

        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
1190
1191#[cfg(test)]
1192mod tests;