Skip to main content

provenant/license_detection/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! License Detection Engine
5
6pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(all(test, feature = "golden-tests"))]
21mod golden_test;
22#[cfg(feature = "golden-tests")]
23pub mod golden_utils;
24pub mod hash_match;
25pub mod index;
26mod match_refine;
27pub mod models;
28pub mod query;
29pub mod rules;
30pub mod seq_match;
31pub mod spdx_lid;
32pub mod spdx_mapping;
33#[cfg(test)]
34mod test_utils;
35pub mod tokenize;
36pub mod unknown_match;
37
38use bit_set::BitSet;
39use std::collections::HashSet;
40use std::fs;
41use std::path::Path;
42use std::sync::Arc;
43use std::time::Instant;
44
45use anyhow::Result;
46
47use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
48use crate::license_detection::dataset::{
49    CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
50    load_license_dataset_from_root,
51};
52use crate::license_detection::embedded::index::{
53    load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
54};
55use crate::license_detection::index::build_index_from_loaded;
56use crate::license_detection::license_cache::{
57    LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
58    compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
59};
60use crate::license_detection::query::Query;
61use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
62use crate::models::LicenseIndexProvenance;
63use crate::utils::text::strip_utf8_bom_str;
64
65use crate::license_detection::detection::{
66    attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
67};
68use crate::license_detection::models::MatcherKind;
69
70/// Path to the license rules directory in the reference scancode-toolkit submodule.
71/// Used by test code and the xtask generate-license-loader-artifact binary.
72#[allow(dead_code)]
73pub const SCANCODE_LICENSES_RULES_PATH: &str =
74    "reference/scancode-toolkit/src/licensedcode/data/rules";
75
76/// Path to the licenses directory in the reference scancode-toolkit submodule.
77/// Used by test code and the xtask generate-license-loader-artifact binary.
78#[allow(dead_code)]
79pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
80    "reference/scancode-toolkit/src/licensedcode/data/licenses";
81
82/// Path to the license data directory in the reference scancode-toolkit submodule.
83/// Used by test code and the xtask generate-license-loader-artifact binary.
84#[allow(dead_code)]
85pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
86
/// URL template pointing at the public ScanCode LicenseDB site.
///
/// NOTE(review): the `{}` placeholder is presumably filled in by callers outside
/// this module (e.g. with a license key or file name) — confirm before relying on it.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Error message carried by the error that [`ensure_within_deadline`] returns once
/// the detection deadline has passed.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
89
90pub(crate) use detection::{
91    LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
92};
93pub use models::LicenseMatch;
94
95pub use aho_match::aho_match;
96pub use hash_match::hash_match;
97pub use match_refine::{
98    filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
99    refine_matches_without_false_positive_filter, split_weak_matches,
100};
101pub use position_set::PositionSet;
102pub use spdx_lid::spdx_lid_match;
103pub use token_multiset::TokenMultiset;
104pub use token_set::TokenSet;
105pub use unknown_match::unknown_match;
106
107use self::seq_match::{
108    MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
109    seq_match_with_candidates_and_deadline,
110};
111
/// License detection engine that orchestrates the detection pipeline.
///
/// The engine loads license rules and builds an index for efficient matching.
/// It supports multiple matching strategies (hash, SPDX-LID, Aho-Corasick, sequence)
/// and combines their results into final license detections.
///
/// The index is held behind an [`Arc`], so cloning the engine shares the index
/// rather than copying it.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// License index every matcher runs against.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses (sorted by key) at construction time.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version reported by the artifact metadata or dataset
    /// manifest the engine was built from; `None` for test engines.
    spdx_license_list_version: Option<String>,
    /// Provenance of the license data behind the index; `None` for test engines.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
124
/// Upper bound, in bytes, on the text handed to detection; longer input is truncated.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024; // 10MB
/// Candidate cap passed to sequence-candidate selection for regular (per query run) matching.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Allowance, in query positions, used to bound how many positions a sequence
/// container may cover beyond its exact child matches (scaled by the child count
/// in the same-expression check, used as-is in the composite-wrapper check).
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance for positions covered by the exact children but not by the
/// sequence container; the same-expression check permits this value plus one.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
129
/// Reports whether `deadline` has been reached.
///
/// A missing deadline (`None`) never expires; otherwise the deadline counts as
/// exceeded as soon as the current instant is at or past it.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
133
134pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
135    if deadline_exceeded(deadline) {
136        Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
137    } else {
138        Ok(())
139    }
140}
141
142fn truncate_detection_text(clean_text: &str) -> &str {
143    if clean_text.len() <= MAX_DETECTION_SIZE {
144        return clean_text;
145    }
146
147    log::debug!(
148        "Content size {} exceeds limit {}, truncating for detection",
149        clean_text.len(),
150        MAX_DETECTION_SIZE
151    );
152
153    let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
154    &clean_text[..boundary]
155}
156
157fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
158    (!m.query_span().is_empty()).then(|| m.query_span().clone())
159}
160
161fn has_full_match_coverage(m: &LicenseMatch) -> bool {
162    m.coverage() == 100.0
163}
164
165fn is_redundant_same_expression_seq_container(
166    container: &LicenseMatch,
167    candidate_contained_matches: &[LicenseMatch],
168) -> bool {
169    let container_is_redundant_coverage =
170        has_full_match_coverage(container) || container.coverage() >= 99.0;
171    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
172        return false;
173    }
174
175    let container_qspan_set = container.qspan_set();
176
177    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
178        .iter()
179        .filter(|m| {
180            m.matcher == MatcherKind::Aho
181                && has_full_match_coverage(m)
182                && m.license_expression == container.license_expression
183                && m.overlaps_with(&container_qspan_set)
184        })
185        .collect();
186
187    if contained.len() < 2 {
188        return false;
189    }
190
191    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
192    if material_children < 2 {
193        return false;
194    }
195
196    contained.sort_by_key(|m| m.qspan_bounds());
197
198    let mut child_union = PositionSet::new();
199    for m in &contained {
200        child_union.extend_from_span(m.query_span());
201    }
202
203    let container_only_positions = container_qspan_set.difference(&child_union);
204    let child_only_positions = child_union.difference(&container_qspan_set);
205
206    let mut bridge_positions = BitSet::new();
207    for pair in contained.windows(2) {
208        let (_, previous_end) = pair[0].qspan_bounds();
209        let (next_start, _) = pair[1].qspan_bounds();
210
211        if next_start < previous_end {
212            return false;
213        }
214
215        for pos in previous_end..next_start {
216            bridge_positions.insert(pos);
217        }
218    }
219
220    let container_only_boundary_positions = container_only_positions
221        .iter()
222        .filter(|&pos| !bridge_positions.contains(pos))
223        .count();
224
225    if container_only_positions.len() == 1
226        && container_only_boundary_positions == 0
227        && child_only_positions.is_empty()
228    {
229        return false;
230    }
231
232    if child_only_positions.is_empty()
233        && container_only_positions.len() == container_only_boundary_positions
234        && container_only_boundary_positions <= 3
235    {
236        let earliest_child = contained
237            .iter()
238            .map(|m| m.qspan_bounds().0)
239            .min()
240            .unwrap_or(usize::MAX);
241        let latest_child = contained
242            .iter()
243            .map(|m| m.qspan_bounds().1.saturating_sub(1))
244            .max()
245            .unwrap_or(0);
246
247        let is_one_sided_boundary = container_only_positions
248            .iter()
249            .all(|pos| pos < earliest_child)
250            || container_only_positions
251                .iter()
252                .all(|pos| pos > latest_child);
253
254        if is_one_sided_boundary {
255            return false;
256        }
257    }
258
259    let max_container_only_positions =
260        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
261    let max_container_boundary_positions =
262        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
263    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
264
265    container_only_positions.len() <= max_container_only_positions
266        && container_only_boundary_positions <= max_container_boundary_positions
267        && child_only_positions.len() <= max_child_only_positions
268}
269
270fn filter_redundant_same_expression_seq_containers(
271    seq_matches: Vec<LicenseMatch>,
272    candidate_contained_matches: &[LicenseMatch],
273) -> Vec<LicenseMatch> {
274    seq_matches
275        .into_iter()
276        .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
277        .collect()
278}
279
280fn is_redundant_low_coverage_composite_seq_wrapper(
281    container: &LicenseMatch,
282    candidate_contained_matches: &[LicenseMatch],
283) -> bool {
284    if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
285        return false;
286    }
287
288    let container_qspan_set = container.qspan_set();
289
290    let children: Vec<&LicenseMatch> = candidate_contained_matches
291        .iter()
292        .filter(|m| {
293            m.matcher == aho_match::MATCH_AHO
294                && has_full_match_coverage(m)
295                && m.license_expression != container.license_expression
296                && m.overlaps_with(&container_qspan_set)
297        })
298        .collect();
299
300    if children.len() < 2 {
301        return false;
302    }
303
304    let unique_expressions: HashSet<&str> = children
305        .iter()
306        .map(|m| m.license_expression.as_str())
307        .collect();
308    if unique_expressions.len() < 2 {
309        return false;
310    }
311
312    let mut child_union = PositionSet::new();
313    for m in &children {
314        child_union.extend_from_span(m.query_span());
315    }
316
317    let container_only_positions = container_qspan_set.difference(&child_union);
318    let child_only_positions = child_union.difference(&container_qspan_set);
319
320    let mut sorted_children = children;
321    sorted_children.sort_by_key(|m| m.qspan_bounds());
322
323    let mut bridge_positions = BitSet::new();
324    for pair in sorted_children.windows(2) {
325        let (_, previous_end) = pair[0].qspan_bounds();
326        let (next_start, _) = pair[1].qspan_bounds();
327        for pos in previous_end..next_start {
328            bridge_positions.insert(pos);
329        }
330    }
331
332    let container_only_boundary_positions = container_only_positions
333        .iter()
334        .filter(|&pos| !bridge_positions.contains(pos))
335        .count();
336
337    child_only_positions.is_empty()
338        && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
339        && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
340}
341
342fn filter_redundant_low_coverage_composite_seq_wrappers(
343    seq_matches: Vec<LicenseMatch>,
344    candidate_contained_matches: &[LicenseMatch],
345) -> Vec<LicenseMatch> {
346    seq_matches
347        .into_iter()
348        .filter(|m| {
349            !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
350        })
351        .collect()
352}
353
354fn subtract_spdx_match_qspans(
355    query: &mut Query<'_>,
356    matched_qspans: &mut Vec<models::PositionSpan>,
357    aho_extra_matchables: &mut PositionSet,
358    spdx_matches: &[LicenseMatch],
359) {
360    for m in spdx_matches {
361        let Some(span) = query_span_for_match(m) else {
362            continue;
363        };
364
365        aho_extra_matchables.extend_from_span(&span);
366        query.subtract(&span);
367
368        if has_full_match_coverage(m) {
369            matched_qspans.push(span);
370        }
371    }
372}
373
374fn merge_and_prepare_aho_matches(
375    index: &index::LicenseIndex,
376    query: &mut Query<'_>,
377    matched_qspans: &mut Vec<models::PositionSpan>,
378    refined_aho: &[LicenseMatch],
379) -> (Vec<LicenseMatch>, bool) {
380    let merged_aho = merge_overlapping_matches(refined_aho);
381    let mut saw_long_exact_license_text_match = false;
382
383    for m in &merged_aho {
384        let Some(span) = query_span_for_match(m) else {
385            continue;
386        };
387
388        if has_full_match_coverage(m) {
389            matched_qspans.push(span.clone());
390        }
391
392        if index
393            .rules_by_rid
394            .get(m.rid)
395            .is_some_and(|rule| rule.is_license_text())
396            && m.rule_length > 120
397            && m.coverage() > 98.0
398        {
399            query.subtract(&span);
400            saw_long_exact_license_text_match = true;
401        }
402    }
403
404    (merged_aho, saw_long_exact_license_text_match)
405}
406
407fn collect_whole_query_exact_followup_matches(
408    index: &index::LicenseIndex,
409    query: &mut Query<'_>,
410    matched_qspans: &mut Vec<models::PositionSpan>,
411    whole_run: &query::QueryRun<'_>,
412    deadline: Option<Instant>,
413) -> Result<Vec<LicenseMatch>> {
414    let mut seq_all_matches = Vec::new();
415
416    if whole_run.is_matchable(false, matched_qspans) {
417        let near_dupe_candidates = if deadline.is_some() {
418            select_seq_candidates_with_deadline(
419                index,
420                whole_run,
421                true,
422                MAX_NEAR_DUPE_CANDIDATES,
423                deadline,
424            )?
425        } else {
426            self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
427        };
428
429        if !near_dupe_candidates.is_empty() {
430            let near_dupe_matches = if deadline.is_some() {
431                seq_match_with_candidates_and_deadline(
432                    index,
433                    whole_run,
434                    &near_dupe_candidates,
435                    deadline,
436                )?
437            } else {
438                self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
439            };
440
441            for m in &near_dupe_matches {
442                if !m.query_span().is_empty() {
443                    let span = m.query_span().clone();
444                    query.subtract(&span);
445                    matched_qspans.push(span);
446                }
447            }
448
449            seq_all_matches.extend(near_dupe_matches);
450        }
451    }
452
453    Ok(seq_all_matches)
454}
455
456fn collect_regular_seq_matches(
457    index: &index::LicenseIndex,
458    query: &Query<'_>,
459    matched_qspans: &[models::PositionSpan],
460    candidate_contained_matches: &[LicenseMatch],
461    deadline: Option<Instant>,
462) -> Result<Vec<LicenseMatch>> {
463    let mut seq_all_matches = Vec::new();
464
465    for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
466        if query_run_index % 8 == 0 {
467            ensure_within_deadline(deadline)?;
468        }
469
470        if !query_run.is_matchable(false, matched_qspans) {
471            continue;
472        }
473
474        let candidates = if deadline.is_some() {
475            select_seq_candidates_with_deadline(
476                index,
477                &query_run,
478                false,
479                MAX_REGULAR_SEQ_CANDIDATES,
480                deadline,
481            )?
482        } else {
483            self::seq_match::select_seq_candidates(
484                index,
485                &query_run,
486                false,
487                MAX_REGULAR_SEQ_CANDIDATES,
488            )
489        };
490        if !candidates.is_empty() {
491            let matches = if deadline.is_some() {
492                seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
493            } else {
494                self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
495            };
496            seq_all_matches.extend(matches);
497        }
498    }
499
500    let merged_seq = merge_overlapping_matches(&seq_all_matches);
501    let filtered_same_expression =
502        filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
503    Ok(filter_redundant_low_coverage_composite_seq_wrappers(
504        filtered_same_expression,
505        candidate_contained_matches,
506    ))
507}
508
509impl LicenseDetectionEngine {
510    /// Create a new license detection engine from a pre-built license index.
511    ///
512    /// This is an internal constructor used by `from_directory()` and `from_embedded()`.
513    /// It builds the SPDX mapping from the licenses in the index.
514    fn from_index(
515        index: index::LicenseIndex,
516        spdx_license_list_version: Option<String>,
517        license_index_provenance: Option<LicenseIndexProvenance>,
518    ) -> Result<Self> {
519        let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
520        license_vec.sort_by(|a, b| a.key.cmp(&b.key));
521        let spdx_mapping = build_spdx_mapping(&license_vec);
522
523        Ok(Self {
524            index: Arc::new(index),
525            spdx_mapping,
526            spdx_license_list_version,
527            license_index_provenance,
528        })
529    }
530
531    #[cfg(test)]
532    pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
533        Self::from_index(index, None, None).expect("test index should build license engine")
534    }
535
536    /// Create a new license detection engine from the embedded license index.
537    ///
538    /// Convenience method that uses the default Provenant cache root and does
539    /// not force a reindex.
540    pub fn from_embedded() -> Result<Self> {
541        let cache_config =
542            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
543        Self::from_embedded_with_cache(&cache_config)
544    }
545
546    /// Create a new license detection engine from the embedded license index.
547    ///
548    /// This method loads the build-time embedded license artifact and constructs
549    /// the runtime license index. This eliminates the runtime dependency on the
550    /// ScanCode rules directory.
551    ///
552    /// If a valid cache exists (matching fingerprint), the index is loaded from
553    /// the rkyv cache file instead of being rebuilt from scratch.
554    ///
555    /// # Arguments
556    /// * `cache_config` - Cache configuration (directory and reindex flag)
557    ///
558    /// # Returns
559    /// A Result containing the engine or an error
560    pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
561        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
562        let fingerprint = compute_artifact_fingerprint(artifact_bytes);
563        let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
564            .map_err(|e| {
565                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
566            })?;
567        debug_assert_eq!(
568            artifact_metadata.license_index_provenance.source,
569            EMBEDDED_LICENSE_INDEX_SOURCE
570        );
571        let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
572        let provenance = Some(artifact_metadata.license_index_provenance.clone());
573
574        if !cache_config.reindex {
575            if let Some(cached) =
576                load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
577            {
578                let start = Instant::now();
579                eprintln!(
580                    "License index loaded from rkyv cache in {:.2}s",
581                    start.elapsed().as_secs_f64()
582                );
583                return Self::from_index(cached, spdx_version, provenance);
584            }
585        } else {
586            delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
587        }
588
589        let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
590            .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
591        let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
592        let provenance = Some(snapshot.metadata.license_index_provenance.clone());
593
594        let start = Instant::now();
595        let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
596        eprintln!(
597            "License index built from embedded artifact in {:.2}s",
598            start.elapsed().as_secs_f64()
599        );
600
601        let mut index = index;
602        index.spdx_license_list_version = spdx_version.clone();
603        if let Err(e) = save_cached_index(
604            cache_config,
605            LicenseCacheNamespace::Embedded,
606            &index,
607            &fingerprint,
608        ) {
609            eprintln!("Warning: failed to save license index cache: {}", e);
610        } else if let Some(size) =
611            cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
612        {
613            eprintln!(
614                "License index cache saved ({:.1} MB)",
615                size as f64 / 1_048_576.0
616            );
617        }
618
619        Self::from_index(index, spdx_version, provenance)
620    }
621
622    /// Create a new license detection engine from a license dataset root.
623    ///
624    /// Convenience method that uses the default Provenant cache root and does
625    /// not force a reindex.
626    pub fn from_directory(rules_path: &Path) -> Result<Self> {
627        let cache_config =
628            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
629        Self::from_directory_with_cache(rules_path, &cache_config)
630    }
631
632    /// Create a new license detection engine from a directory of license rules.
633    ///
634    /// If a valid cache exists (matching fingerprint of the dataset), the index is
635    /// loaded from the rkyv cache file instead of being rebuilt from scratch.
636    ///
637    /// # Arguments
638    /// * `rules_path` - Path to dataset root containing rules/ and licenses/
639    /// * `cache_config` - Cache configuration (directory and reindex flag)
640    ///
641    /// # Returns
642    /// A Result containing the engine or an error
643    pub fn from_directory_with_cache(
644        rules_path: &Path,
645        cache_config: &LicenseCacheConfig,
646    ) -> Result<Self> {
647        let LoadedLicenseDataset {
648            manifest,
649            rules: loaded_rules,
650            licenses: loaded_licenses,
651        } = load_license_dataset_from_root(rules_path)?;
652
653        let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
654        let provenance = Some(LicenseIndexProvenance {
655            source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
656            dataset_fingerprint: compute_dataset_fingerprint_string(
657                &loaded_rules,
658                &loaded_licenses,
659            )?,
660            ignored_rules: vec![],
661            ignored_licenses: vec![],
662            ignored_rules_due_to_licenses: vec![],
663            added_rules: vec![],
664            replaced_rules: vec![],
665            added_licenses: vec![],
666            replaced_licenses: vec![],
667        });
668
669        if !cache_config.reindex {
670            if let Some(cached) = load_cached_index(
671                cache_config,
672                LicenseCacheNamespace::CustomRules,
673                &fingerprint,
674            )? {
675                let start = Instant::now();
676                eprintln!(
677                    "License index loaded from rkyv cache in {:.2}s",
678                    start.elapsed().as_secs_f64()
679                );
680                return Self::from_index(
681                    cached,
682                    Some(manifest.spdx_license_list_version),
683                    provenance,
684                );
685            }
686        } else {
687            delete_cache(
688                cache_config,
689                LicenseCacheNamespace::CustomRules,
690                &fingerprint,
691            )?;
692        }
693
694        let start = Instant::now();
695        let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
696        eprintln!(
697            "License index built from custom dataset in {:.2}s",
698            start.elapsed().as_secs_f64()
699        );
700
701        if let Err(e) = save_cached_index(
702            cache_config,
703            LicenseCacheNamespace::CustomRules,
704            &index,
705            &fingerprint,
706        ) {
707            eprintln!("Warning: failed to save license index cache: {}", e);
708        } else if let Some(size) = cache_file_size(
709            cache_config,
710            LicenseCacheNamespace::CustomRules,
711            &fingerprint,
712        ) {
713            eprintln!(
714                "License index cache saved ({:.1} MB)",
715                size as f64 / 1_048_576.0
716            );
717        }
718
719        Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
720    }
721
722    pub fn embedded_spdx_license_list_version() -> Result<String> {
723        let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
724        Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
725            .map_err(|e| {
726                anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
727            })?
728            .spdx_license_list_version)
729    }
730
731    pub fn detect_with_kind(
732        &self,
733        text: &str,
734        unknown_licenses: bool,
735        binary_derived: bool,
736    ) -> Result<Vec<LicenseDetection>> {
737        self.detect_with_kind_with_score_and_deadline(
738            text,
739            unknown_licenses,
740            binary_derived,
741            0.0,
742            None,
743        )
744    }
745
746    pub fn detect_with_kind_with_score(
747        &self,
748        text: &str,
749        unknown_licenses: bool,
750        binary_derived: bool,
751        min_score: f32,
752    ) -> Result<Vec<LicenseDetection>> {
753        self.detect_with_kind_with_score_and_deadline(
754            text,
755            unknown_licenses,
756            binary_derived,
757            min_score,
758            None,
759        )
760    }
761
762    pub(crate) fn detect_with_kind_with_score_and_deadline(
763        &self,
764        text: &str,
765        unknown_licenses: bool,
766        binary_derived: bool,
767        min_score: f32,
768        deadline: Option<Instant>,
769    ) -> Result<Vec<LicenseDetection>> {
770        ensure_within_deadline(deadline)?;
771        let clean_text = strip_utf8_bom_str(text);
772
773        let content = truncate_detection_text(clean_text);
774
775        ensure_within_deadline(deadline)?;
776        let mut query = if deadline.is_some() {
777            Query::from_extracted_text_with_deadline(
778                content,
779                &self.index,
780                binary_derived,
781                deadline,
782            )?
783        } else {
784            Query::from_extracted_text(content, &self.index, binary_derived)?
785        };
786        let whole_query_run = query.whole_query_run();
787
788        let mut all_matches = Vec::new();
789        let mut candidate_contained_matches = Vec::new();
790        let mut aho_extra_matchables = PositionSet::new();
791        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
792
793        // Phase 1a: Hash matching
794        // Python returns immediately if hash matches found (index.py:987-991)
795        {
796            ensure_within_deadline(deadline)?;
797            let hash_matches = hash_match(&self.index, &whole_query_run);
798
799            if !hash_matches.is_empty() {
800                let mut matches = hash_matches;
801                sort_matches_by_line(&mut matches);
802
803                let groups = group_matches_by_region(&matches);
804                let detections: Vec<LicenseDetection> = groups
805                    .iter()
806                    .map(|group| {
807                        let mut detection = empty_detection();
808                        populate_detection_from_group_with_spdx(
809                            &mut detection,
810                            group,
811                            &self.spdx_mapping,
812                            Some(content),
813                        );
814                        detection
815                    })
816                    .collect();
817
818                return Ok(post_process_detections(detections, min_score));
819            }
820        }
821
822        // Phase 1b: SPDX-LID matching
823        {
824            ensure_within_deadline(deadline)?;
825            let spdx_matches = spdx_lid_match(&self.index, &query);
826            subtract_spdx_match_qspans(
827                &mut query,
828                &mut matched_qspans,
829                &mut aho_extra_matchables,
830                &spdx_matches,
831            );
832            all_matches.extend(spdx_matches);
833        }
834
835        // Phase 1c: Aho-Corasick matching
836        {
837            ensure_within_deadline(deadline)?;
838            let aho_matches = if aho_extra_matchables.is_empty() {
839                if deadline.is_some() {
840                    aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
841                } else {
842                    aho_match(&self.index, &whole_query_run)
843                }
844            } else {
845                if deadline.is_some() {
846                    aho_match::aho_match_with_extra_matchables(
847                        &self.index,
848                        &whole_query_run,
849                        Some(&aho_extra_matchables),
850                        deadline,
851                    )?
852                } else {
853                    aho_match::aho_match_with_extra_matchables(
854                        &self.index,
855                        &whole_query_run,
856                        Some(&aho_extra_matchables),
857                        None,
858                    )?
859                }
860            };
861
862            // Python's get_exact_matches() calls refine_matches with merge=False
863            // This applies quality filters including required phrase filtering
864            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
865            candidate_contained_matches.extend(refined_aho.clone());
866            let (merged_aho, _) = merge_and_prepare_aho_matches(
867                &self.index,
868                &mut query,
869                &mut matched_qspans,
870                &refined_aho,
871            );
872            all_matches.extend(merged_aho);
873
874            let whole_query_followup = collect_whole_query_exact_followup_matches(
875                &self.index,
876                &mut query,
877                &mut matched_qspans,
878                &whole_query_run,
879                deadline,
880            )?;
881            all_matches.extend(whole_query_followup);
882
883            let merged_seq = collect_regular_seq_matches(
884                &self.index,
885                &query,
886                &matched_qspans,
887                &candidate_contained_matches,
888                deadline,
889            )?;
890            all_matches.extend(merged_seq);
891        }
892
893        // Step 1: Initial refine WITHOUT false positive filtering
894        // Python: refine_matches with filter_false_positive=False (index.py:1073-1080)
895        ensure_within_deadline(deadline)?;
896        let merged_matches =
897            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
898
899        // Step 2: Unknown detection and weak match handling
900        // Python: index.py:1079-1118 - only runs when unknown_licenses=True
901        let refined_matches = if unknown_licenses {
902            // Split weak from good - Python: index.py:1083
903            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
904
905            // Unknown detection on uncovered regions - Python: index.py:1093-1114
906            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
907            let filtered_unknown =
908                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
909
910            let mut all_matches = good_matches;
911            all_matches.extend(filtered_unknown);
912            // reinject weak matches and let refine matches keep the bests
913            // Python: index.py:1117-1118
914            all_matches.extend(weak_matches);
915            all_matches
916        } else {
917            merged_matches
918        };
919
920        // Step 5: Final refine WITH false positive filtering - Python: index.py:1130-1145
921        ensure_within_deadline(deadline)?;
922        let refined = refine_matches(&self.index, refined_matches, &query);
923
924        let mut sorted = refined;
925        sort_matches_by_line(&mut sorted);
926
927        let groups = group_matches_by_region(&sorted);
928
929        let detections: Vec<LicenseDetection> = groups
930            .iter()
931            .map(|group| {
932                let mut detection = empty_detection();
933                populate_detection_from_group_with_spdx(
934                    &mut detection,
935                    group,
936                    &self.spdx_mapping,
937                    Some(content),
938                );
939                detection
940            })
941            .collect();
942
943        let detections = post_process_detections(detections, min_score);
944
945        ensure_within_deadline(deadline)?;
946        Ok(detections)
947    }
948
949    pub fn detect_with_kind_and_source(
950        &self,
951        text: &str,
952        unknown_licenses: bool,
953        binary_derived: bool,
954        source_path: &str,
955    ) -> Result<Vec<LicenseDetection>> {
956        self.detect_with_kind_and_source_with_deadline(
957            text,
958            unknown_licenses,
959            binary_derived,
960            source_path,
961            None,
962        )
963    }
964
965    pub(crate) fn detect_with_kind_and_source_with_deadline(
966        &self,
967        text: &str,
968        unknown_licenses: bool,
969        binary_derived: bool,
970        source_path: &str,
971        deadline: Option<Instant>,
972    ) -> Result<Vec<LicenseDetection>> {
973        let mut detections = self.detect_with_kind_with_score_and_deadline(
974            text,
975            unknown_licenses,
976            binary_derived,
977            0.0,
978            deadline,
979        )?;
980        attach_source_path_to_detections(&mut detections, source_path);
981        Ok(detections)
982    }
983
984    pub fn detect_with_kind_and_source_with_score(
985        &self,
986        text: &str,
987        unknown_licenses: bool,
988        binary_derived: bool,
989        source_path: &str,
990        min_score: f32,
991    ) -> Result<Vec<LicenseDetection>> {
992        let mut detections =
993            self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
994        attach_source_path_to_detections(&mut detections, source_path);
995        Ok(detections)
996    }
997
998    pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
999        &self,
1000        text: &str,
1001        unknown_licenses: bool,
1002        binary_derived: bool,
1003        source_path: &str,
1004        min_score: f32,
1005        deadline: Option<Instant>,
1006    ) -> Result<Vec<LicenseDetection>> {
1007        let mut detections = self.detect_with_kind_with_score_and_deadline(
1008            text,
1009            unknown_licenses,
1010            binary_derived,
1011            min_score,
1012            deadline,
1013        )?;
1014        attach_source_path_to_detections(&mut detections, source_path);
1015        Ok(detections)
1016    }
1017
1018    /// Detect licenses and return raw matches (like Python's idx.match()).
1019    ///
1020    /// This is primarily used by golden tests and maintenance tooling that need
1021    /// raw match sequences before grouping or post-processing into detections.
1022    #[cfg(any(test, feature = "golden-tests"))]
1023    pub fn detect_matches_with_kind(
1024        &self,
1025        text: &str,
1026        unknown_licenses: bool,
1027        binary_derived: bool,
1028    ) -> Result<Vec<LicenseMatch>> {
1029        let clean_text = strip_utf8_bom_str(text);
1030
1031        let content = truncate_detection_text(clean_text);
1032
1033        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1034        let whole_query_run = query.whole_query_run();
1035
1036        let mut all_matches = Vec::new();
1037        let mut candidate_contained_matches = Vec::new();
1038        let mut aho_extra_matchables = PositionSet::new();
1039        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1040
1041        // Phase 1a: Hash matching
1042        {
1043            let hash_matches = hash_match(&self.index, &whole_query_run);
1044
1045            if !hash_matches.is_empty() {
1046                let mut matches = hash_matches;
1047                sort_matches_by_line(&mut matches);
1048                return Ok(matches);
1049            }
1050        }
1051
1052        // Phase 1b: SPDX-LID matching
1053        {
1054            let spdx_matches = spdx_lid_match(&self.index, &query);
1055            subtract_spdx_match_qspans(
1056                &mut query,
1057                &mut matched_qspans,
1058                &mut aho_extra_matchables,
1059                &spdx_matches,
1060            );
1061            all_matches.extend(spdx_matches);
1062        }
1063
1064        // Phase 1c: Aho-Corasick matching
1065        {
1066            let aho_matches = if aho_extra_matchables.is_empty() {
1067                aho_match(&self.index, &whole_query_run)
1068            } else {
1069                aho_match::aho_match_with_extra_matchables(
1070                    &self.index,
1071                    &whole_query_run,
1072                    Some(&aho_extra_matchables),
1073                    None,
1074                )?
1075            };
1076            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1077            candidate_contained_matches.extend(refined_aho.clone());
1078            let (merged_aho, _) = merge_and_prepare_aho_matches(
1079                &self.index,
1080                &mut query,
1081                &mut matched_qspans,
1082                &refined_aho,
1083            );
1084            all_matches.extend(merged_aho);
1085
1086            let whole_query_followup = collect_whole_query_exact_followup_matches(
1087                &self.index,
1088                &mut query,
1089                &mut matched_qspans,
1090                &whole_query_run,
1091                None,
1092            )?;
1093            all_matches.extend(whole_query_followup);
1094
1095            let merged_seq = collect_regular_seq_matches(
1096                &self.index,
1097                &query,
1098                &matched_qspans,
1099                &candidate_contained_matches,
1100                None,
1101            )?;
1102            all_matches.extend(merged_seq);
1103        }
1104
1105        // Step 1: Initial refine WITHOUT false positive filtering
1106        let merged_matches =
1107            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1108
1109        // Step 2: Unknown detection and weak match handling
1110        let refined_matches = if unknown_licenses {
1111            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1112            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1113            let filtered_unknown =
1114                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1115
1116            let mut all_matches = good_matches;
1117            all_matches.extend(filtered_unknown);
1118            all_matches.extend(weak_matches);
1119            all_matches
1120        } else {
1121            merged_matches
1122        };
1123
1124        // Step 3: Final refine WITH false positive filtering - Python: index.py:1130-1145
1125        let refined = refine_matches(&self.index, refined_matches, &query);
1126
1127        let mut sorted = refined;
1128        sort_matches_by_line(&mut sorted);
1129
1130        // Return raw matches (NOT grouped) - this is Python's idx.match() behavior
1131        Ok(sorted)
1132    }
1133
    /// Get a reference to the license index.
    ///
    /// Borrows the index stored on `self`; nothing is cloned or rebuilt.
    pub fn index(&self) -> &index::LicenseIndex {
        &self.index
    }
1138
    /// Get the SPDX license list version stored on `self`, if any.
    ///
    /// Returns `None` when no version is stored; otherwise the stored value is
    /// borrowed as a `&str`.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }
1142
    /// Get a reference to the license index provenance stored on `self`, if any.
    ///
    /// Returns `None` when no provenance is stored.
    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
        self.license_index_provenance.as_ref()
    }
1146
    /// Get a reference to the SPDX mapping.
    ///
    /// Only compiled for test builds (`#[cfg(test)]`).
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
1152}
1153
1154pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1155    for ancestor in search_path.ancestors() {
1156        let candidate = ancestor.join("scancode_config.py");
1157        if candidate.is_file() {
1158            let config = fs::read_to_string(&candidate)?;
1159            return Ok(parse_scancode_spdx_license_list_version(&config));
1160        }
1161    }
1162
1163    Ok(None)
1164}
1165
/// Extract the value assigned to `spdx_license_list_version` in the text of a
/// `scancode_config.py` file.
///
/// Lines are scanned top to bottom and the first matching assignment wins.
/// A line matches when, after trimming, its left-hand side is exactly the
/// identifier `spdx_license_list_version`, optionally followed by a type
/// annotation (`spdx_license_list_version: str = ...`). Keys that merely share
/// the prefix (for example `spdx_license_list_version_url`) are ignored.
///
/// The value is returned without its surrounding quotes. Anything after the
/// closing quote, such as a trailing `# comment`, is dropped; for an unquoted
/// value everything from the first `#` onwards is dropped.
///
/// Returns `None` when no line assigns the key.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (target, value) = line.trim().split_once('=')?;

        // Require the exact identifier: what follows the key on the left-hand
        // side may only be whitespace or a `: annotation`.
        let after_key = target.strip_prefix(KEY)?.trim_start();
        if !(after_key.is_empty() || after_key.starts_with(':')) {
            return None;
        }

        let value = value.trim();
        let version = match value.chars().next() {
            Some(quote @ ('"' | '\'')) => {
                let body = &value[quote.len_utf8()..];
                match body.find(quote) {
                    // Properly quoted: keep only what is between the quotes.
                    Some(end) => &body[..end],
                    // Unterminated quote: fall back to stripping quote
                    // characters from both ends.
                    None => value.trim_matches('"').trim_matches('\''),
                }
            }
            // Unquoted value: cut off any inline comment.
            _ => value.split('#').next().unwrap_or(value).trim_end(),
        };

        Some(version.to_string())
    })
}
1179
1180#[cfg(test)]
1181mod tests;