1pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(feature = "golden-tests")]
21pub mod golden_utils;
22pub mod hash_match;
23pub mod index;
24mod match_refine;
25pub mod models;
26pub mod query;
27pub mod rules;
28pub mod seq_match;
29pub mod spdx_lid;
30pub mod spdx_mapping;
31#[cfg(test)]
32mod test_utils;
33pub mod tokenize;
34pub mod unknown_match;
35
36use bit_set::BitSet;
37use std::collections::HashSet;
38use std::fs;
39use std::path::Path;
40use std::sync::Arc;
41use std::time::Instant;
42
43use anyhow::Result;
44
45use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
46use crate::license_detection::dataset::{
47 CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
48 load_license_dataset_from_root,
49};
50use crate::license_detection::embedded::index::{
51 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
52};
53use crate::license_detection::index::build_index_from_loaded;
54use crate::license_detection::license_cache::{
55 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
56 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
57};
58use crate::license_detection::query::Query;
59use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
60use crate::models::LicenseIndexProvenance;
61use crate::utils::text::strip_utf8_bom_str;
62
63use crate::license_detection::detection::{
64 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
65 split_groups_across_frontmatter_boundary,
66};
67use crate::license_detection::models::MatcherKind;
68
/// Relative path of the ScanCode `rules` data directory inside the
/// `reference/scancode-toolkit` tree.
// NOTE(review): `dead_code` is allowed here presumably because not every build
// configuration references this path — confirm before removing the attribute.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Relative path of the ScanCode `licenses` data directory inside the
/// `reference/scancode-toolkit` tree.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Relative path of the ScanCode `licensedcode/data` directory, the parent of the
/// `rules` and `licenses` directories named by the two constants above.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// LicenseDB URL template containing a single `{}` placeholder.
// NOTE(review): the placeholder is presumably filled with a license key — verify
// against the code that formats this template.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Error message carried by the error that `ensure_within_deadline` returns once the
/// detection deadline has passed.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
88
89pub(crate) use detection::{
90 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
91};
92pub use models::LicenseMatch;
93
94pub use aho_match::aho_match;
95pub use hash_match::hash_match;
96pub use match_refine::{
97 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
98 refine_matches_without_false_positive_filter, split_weak_matches,
99};
100pub use position_set::PositionSet;
101pub use spdx_lid::spdx_lid_match;
102pub use token_multiset::TokenMultiset;
103pub use token_set::TokenSet;
104pub use unknown_match::unknown_match;
105
106use self::seq_match::{
107 MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
108 seq_match_with_candidates_and_deadline,
109};
110
/// License detection engine: a shared license index plus the SPDX data derived from it.
///
/// Instances are created through the `from_*` constructors, all of which funnel into
/// `from_index`. Cloning is cheap for the index itself because it is held in an `Arc`.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Reference-counted license index handed to every matcher.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is constructed.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version reported by the data source, when one is known.
    spdx_license_list_version: Option<String>,
    /// Provenance record describing where the index data came from, when available.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
123
/// Maximum number of bytes of cleaned text passed to detection (10 MiB); longer input
/// is truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate cap passed to sequence-candidate selection for each regular query run.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Per-child allowance of container-only positions used by the redundant sequence
/// container heuristics.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance of child-only positions used by the same-expression redundancy
/// heuristic.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
128
/// Reports whether a deadline is set and the current instant has reached or passed it.
///
/// Returns `false` when `deadline` is `None`.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => limit <= Instant::now(),
        None => false,
    }
}
132
133pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
134 if deadline_exceeded(deadline) {
135 Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
136 } else {
137 Ok(())
138 }
139}
140
141fn truncate_detection_text(clean_text: &str) -> &str {
142 if clean_text.len() <= MAX_DETECTION_SIZE {
143 return clean_text;
144 }
145
146 log::debug!(
147 "Content size {} exceeds limit {}, truncating for detection",
148 clean_text.len(),
149 MAX_DETECTION_SIZE
150 );
151
152 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
153 &clean_text[..boundary]
154}
155
156fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
157 (!m.query_span().is_empty()).then(|| m.query_span().clone())
158}
159
160fn has_full_match_coverage(m: &LicenseMatch) -> bool {
161 m.coverage() == 100.0
162}
163
164fn is_redundant_same_expression_seq_container(
165 container: &LicenseMatch,
166 candidate_contained_matches: &[LicenseMatch],
167) -> bool {
168 let container_is_redundant_coverage =
169 has_full_match_coverage(container) || container.coverage() >= 99.0;
170 if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
171 return false;
172 }
173
174 let container_qspan_set = container.qspan_set();
175
176 let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
177 .iter()
178 .filter(|m| {
179 m.matcher == MatcherKind::Aho
180 && has_full_match_coverage(m)
181 && m.license_expression == container.license_expression
182 && m.overlaps_with(&container_qspan_set)
183 })
184 .collect();
185
186 if contained.len() < 2 {
187 return false;
188 }
189
190 let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
191 if material_children < 2 {
192 return false;
193 }
194
195 contained.sort_by_key(|m| m.qspan_bounds());
196
197 let mut child_union = PositionSet::new();
198 for m in &contained {
199 child_union.extend_from_span(m.query_span());
200 }
201
202 let container_only_positions = container_qspan_set.difference(&child_union);
203 let child_only_positions = child_union.difference(&container_qspan_set);
204
205 let mut bridge_positions = BitSet::new();
206 for pair in contained.windows(2) {
207 let (_, previous_end) = pair[0].qspan_bounds();
208 let (next_start, _) = pair[1].qspan_bounds();
209
210 if next_start < previous_end {
211 return false;
212 }
213
214 for pos in previous_end..next_start {
215 bridge_positions.insert(pos);
216 }
217 }
218
219 let container_only_boundary_positions = container_only_positions
220 .iter()
221 .filter(|&pos| !bridge_positions.contains(pos))
222 .count();
223
224 if container_only_positions.len() == 1
225 && container_only_boundary_positions == 0
226 && child_only_positions.is_empty()
227 {
228 return false;
229 }
230
231 if child_only_positions.is_empty()
232 && container_only_positions.len() == container_only_boundary_positions
233 && container_only_boundary_positions <= 3
234 {
235 let earliest_child = contained
236 .iter()
237 .map(|m| m.qspan_bounds().0)
238 .min()
239 .unwrap_or(usize::MAX);
240 let latest_child = contained
241 .iter()
242 .map(|m| m.qspan_bounds().1.saturating_sub(1))
243 .max()
244 .unwrap_or(0);
245
246 let is_one_sided_boundary = container_only_positions
247 .iter()
248 .all(|pos| pos < earliest_child)
249 || container_only_positions
250 .iter()
251 .all(|pos| pos > latest_child);
252
253 if is_one_sided_boundary {
254 return false;
255 }
256 }
257
258 let max_container_only_positions =
259 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
260 let max_container_boundary_positions =
261 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
262 let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
263
264 container_only_positions.len() <= max_container_only_positions
265 && container_only_boundary_positions <= max_container_boundary_positions
266 && child_only_positions.len() <= max_child_only_positions
267}
268
269fn filter_redundant_same_expression_seq_containers(
270 seq_matches: Vec<LicenseMatch>,
271 candidate_contained_matches: &[LicenseMatch],
272) -> Vec<LicenseMatch> {
273 seq_matches
274 .into_iter()
275 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
276 .collect()
277}
278
279fn is_redundant_low_coverage_composite_seq_wrapper(
280 container: &LicenseMatch,
281 candidate_contained_matches: &[LicenseMatch],
282) -> bool {
283 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
284 return false;
285 }
286
287 let container_qspan_set = container.qspan_set();
288
289 let children: Vec<&LicenseMatch> = candidate_contained_matches
290 .iter()
291 .filter(|m| {
292 m.matcher == aho_match::MATCH_AHO
293 && has_full_match_coverage(m)
294 && m.license_expression != container.license_expression
295 && m.overlaps_with(&container_qspan_set)
296 })
297 .collect();
298
299 if children.len() < 2 {
300 return false;
301 }
302
303 let unique_expressions: HashSet<&str> = children
304 .iter()
305 .map(|m| m.license_expression.as_str())
306 .collect();
307 if unique_expressions.len() < 2 {
308 return false;
309 }
310
311 let mut child_union = PositionSet::new();
312 for m in &children {
313 child_union.extend_from_span(m.query_span());
314 }
315
316 let container_only_positions = container_qspan_set.difference(&child_union);
317 let child_only_positions = child_union.difference(&container_qspan_set);
318
319 let mut sorted_children = children;
320 sorted_children.sort_by_key(|m| m.qspan_bounds());
321
322 let mut bridge_positions = BitSet::new();
323 for pair in sorted_children.windows(2) {
324 let (_, previous_end) = pair[0].qspan_bounds();
325 let (next_start, _) = pair[1].qspan_bounds();
326 for pos in previous_end..next_start {
327 bridge_positions.insert(pos);
328 }
329 }
330
331 let container_only_boundary_positions = container_only_positions
332 .iter()
333 .filter(|&pos| !bridge_positions.contains(pos))
334 .count();
335
336 child_only_positions.is_empty()
337 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
338 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
339}
340
341fn filter_redundant_low_coverage_composite_seq_wrappers(
342 seq_matches: Vec<LicenseMatch>,
343 candidate_contained_matches: &[LicenseMatch],
344) -> Vec<LicenseMatch> {
345 seq_matches
346 .into_iter()
347 .filter(|m| {
348 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
349 })
350 .collect()
351}
352
353fn subtract_spdx_match_qspans(
354 query: &mut Query<'_>,
355 matched_qspans: &mut Vec<models::PositionSpan>,
356 aho_extra_matchables: &mut PositionSet,
357 spdx_matches: &[LicenseMatch],
358) {
359 for m in spdx_matches {
360 let Some(span) = query_span_for_match(m) else {
361 continue;
362 };
363
364 aho_extra_matchables.extend_from_span(&span);
365 query.subtract(&span);
366
367 if has_full_match_coverage(m) {
368 matched_qspans.push(span);
369 }
370 }
371}
372
373fn merge_and_prepare_aho_matches(
374 index: &index::LicenseIndex,
375 query: &mut Query<'_>,
376 matched_qspans: &mut Vec<models::PositionSpan>,
377 refined_aho: &[LicenseMatch],
378) -> (Vec<LicenseMatch>, bool) {
379 let merged_aho = merge_overlapping_matches(refined_aho);
380 let mut saw_long_exact_license_text_match = false;
381
382 for m in &merged_aho {
383 let Some(span) = query_span_for_match(m) else {
384 continue;
385 };
386
387 if has_full_match_coverage(m) {
388 matched_qspans.push(span.clone());
389 }
390
391 if index
392 .rules_by_rid
393 .get(m.rid)
394 .is_some_and(|rule| rule.is_license_text())
395 && m.rule_length > 120
396 && m.coverage() > 98.0
397 {
398 query.subtract(&span);
399 saw_long_exact_license_text_match = true;
400 }
401 }
402
403 (merged_aho, saw_long_exact_license_text_match)
404}
405
406fn collect_whole_query_exact_followup_matches(
407 index: &index::LicenseIndex,
408 query: &mut Query<'_>,
409 matched_qspans: &mut Vec<models::PositionSpan>,
410 whole_run: &query::QueryRun<'_>,
411 deadline: Option<Instant>,
412) -> Result<Vec<LicenseMatch>> {
413 let mut seq_all_matches = Vec::new();
414
415 if whole_run.is_matchable(false, matched_qspans) {
416 let near_dupe_candidates = if deadline.is_some() {
417 select_seq_candidates_with_deadline(
418 index,
419 whole_run,
420 true,
421 MAX_NEAR_DUPE_CANDIDATES,
422 deadline,
423 )?
424 } else {
425 self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
426 };
427
428 if !near_dupe_candidates.is_empty() {
429 let near_dupe_matches = if deadline.is_some() {
430 seq_match_with_candidates_and_deadline(
431 index,
432 whole_run,
433 &near_dupe_candidates,
434 deadline,
435 )?
436 } else {
437 self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
438 };
439
440 for m in &near_dupe_matches {
441 if !m.query_span().is_empty() {
442 let span = m.query_span().clone();
443 query.subtract(&span);
444 matched_qspans.push(span);
445 }
446 }
447
448 seq_all_matches.extend(near_dupe_matches);
449 }
450 }
451
452 Ok(seq_all_matches)
453}
454
455fn collect_regular_seq_matches(
456 index: &index::LicenseIndex,
457 query: &Query<'_>,
458 matched_qspans: &[models::PositionSpan],
459 candidate_contained_matches: &[LicenseMatch],
460 deadline: Option<Instant>,
461) -> Result<Vec<LicenseMatch>> {
462 let mut seq_all_matches = Vec::new();
463
464 for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
465 if query_run_index % 8 == 0 {
466 ensure_within_deadline(deadline)?;
467 }
468
469 if !query_run.is_matchable(false, matched_qspans) {
470 continue;
471 }
472
473 let candidates = if deadline.is_some() {
474 select_seq_candidates_with_deadline(
475 index,
476 &query_run,
477 false,
478 MAX_REGULAR_SEQ_CANDIDATES,
479 deadline,
480 )?
481 } else {
482 self::seq_match::select_seq_candidates(
483 index,
484 &query_run,
485 false,
486 MAX_REGULAR_SEQ_CANDIDATES,
487 )
488 };
489 if !candidates.is_empty() {
490 let matches = if deadline.is_some() {
491 seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
492 } else {
493 self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
494 };
495 seq_all_matches.extend(matches);
496 }
497 }
498
499 let merged_seq = merge_overlapping_matches(&seq_all_matches);
500 let filtered_same_expression =
501 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
502 Ok(filter_redundant_low_coverage_composite_seq_wrappers(
503 filtered_same_expression,
504 candidate_contained_matches,
505 ))
506}
507
508impl LicenseDetectionEngine {
509 fn from_index(
514 index: index::LicenseIndex,
515 spdx_license_list_version: Option<String>,
516 license_index_provenance: Option<LicenseIndexProvenance>,
517 ) -> Result<Self> {
518 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
519 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
520 let spdx_mapping = build_spdx_mapping(&license_vec);
521
522 Ok(Self {
523 index: Arc::new(index),
524 spdx_mapping,
525 spdx_license_list_version,
526 license_index_provenance,
527 })
528 }
529
530 #[cfg(test)]
531 pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
532 Self::from_index(index, None, None).expect("test index should build license engine")
533 }
534
535 pub fn from_embedded() -> Result<Self> {
540 let cache_config =
541 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
542 Self::from_embedded_with_cache(&cache_config)
543 }
544
545 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
560 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
561 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
562 let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
563 .map_err(|e| {
564 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
565 })?;
566 debug_assert_eq!(
567 artifact_metadata.license_index_provenance.source,
568 EMBEDDED_LICENSE_INDEX_SOURCE
569 );
570 let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
571 let provenance = Some(artifact_metadata.license_index_provenance.clone());
572
573 if !cache_config.reindex {
574 if let Some(cached) =
575 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
576 {
577 let start = Instant::now();
578 eprintln!(
579 "License index loaded from rkyv cache in {:.2}s",
580 start.elapsed().as_secs_f64()
581 );
582 return Self::from_index(cached, spdx_version, provenance);
583 }
584 } else {
585 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
586 }
587
588 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
589 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
590 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
591 let provenance = Some(snapshot.metadata.license_index_provenance.clone());
592
593 let start = Instant::now();
594 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
595 eprintln!(
596 "License index built from embedded artifact in {:.2}s",
597 start.elapsed().as_secs_f64()
598 );
599
600 let mut index = index;
601 index.spdx_license_list_version = spdx_version.clone();
602 if let Err(e) = save_cached_index(
603 cache_config,
604 LicenseCacheNamespace::Embedded,
605 &index,
606 &fingerprint,
607 ) {
608 eprintln!("Warning: failed to save license index cache: {}", e);
609 } else if let Some(size) =
610 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
611 {
612 eprintln!(
613 "License index cache saved ({:.1} MB)",
614 size as f64 / 1_048_576.0
615 );
616 }
617
618 Self::from_index(index, spdx_version, provenance)
619 }
620
621 pub fn from_directory(rules_path: &Path) -> Result<Self> {
626 let cache_config =
627 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
628 Self::from_directory_with_cache(rules_path, &cache_config)
629 }
630
631 pub fn from_directory_with_cache(
643 rules_path: &Path,
644 cache_config: &LicenseCacheConfig,
645 ) -> Result<Self> {
646 let LoadedLicenseDataset {
647 manifest,
648 rules: loaded_rules,
649 licenses: loaded_licenses,
650 } = load_license_dataset_from_root(rules_path)?;
651
652 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
653 let provenance = Some(LicenseIndexProvenance {
654 source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
655 dataset_fingerprint: compute_dataset_fingerprint_string(
656 &loaded_rules,
657 &loaded_licenses,
658 )?,
659 ignored_rules: vec![],
660 ignored_licenses: vec![],
661 ignored_rules_due_to_licenses: vec![],
662 added_rules: vec![],
663 replaced_rules: vec![],
664 added_licenses: vec![],
665 replaced_licenses: vec![],
666 });
667
668 if !cache_config.reindex {
669 if let Some(cached) = load_cached_index(
670 cache_config,
671 LicenseCacheNamespace::CustomRules,
672 &fingerprint,
673 )? {
674 let start = Instant::now();
675 eprintln!(
676 "License index loaded from rkyv cache in {:.2}s",
677 start.elapsed().as_secs_f64()
678 );
679 return Self::from_index(
680 cached,
681 Some(manifest.spdx_license_list_version),
682 provenance,
683 );
684 }
685 } else {
686 delete_cache(
687 cache_config,
688 LicenseCacheNamespace::CustomRules,
689 &fingerprint,
690 )?;
691 }
692
693 let start = Instant::now();
694 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
695 eprintln!(
696 "License index built from custom dataset in {:.2}s",
697 start.elapsed().as_secs_f64()
698 );
699
700 if let Err(e) = save_cached_index(
701 cache_config,
702 LicenseCacheNamespace::CustomRules,
703 &index,
704 &fingerprint,
705 ) {
706 eprintln!("Warning: failed to save license index cache: {}", e);
707 } else if let Some(size) = cache_file_size(
708 cache_config,
709 LicenseCacheNamespace::CustomRules,
710 &fingerprint,
711 ) {
712 eprintln!(
713 "License index cache saved ({:.1} MB)",
714 size as f64 / 1_048_576.0
715 );
716 }
717
718 Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
719 }
720
721 pub fn embedded_spdx_license_list_version() -> Result<String> {
722 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
723 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
724 .map_err(|e| {
725 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
726 })?
727 .spdx_license_list_version)
728 }
729
730 pub fn detect_with_kind(
731 &self,
732 text: &str,
733 unknown_licenses: bool,
734 binary_derived: bool,
735 ) -> Result<Vec<LicenseDetection>> {
736 self.detect_with_kind_with_score_and_deadline(
737 text,
738 unknown_licenses,
739 binary_derived,
740 0.0,
741 None,
742 )
743 }
744
745 pub fn detect_with_kind_with_score(
746 &self,
747 text: &str,
748 unknown_licenses: bool,
749 binary_derived: bool,
750 min_score: f32,
751 ) -> Result<Vec<LicenseDetection>> {
752 self.detect_with_kind_with_score_and_deadline(
753 text,
754 unknown_licenses,
755 binary_derived,
756 min_score,
757 None,
758 )
759 }
760
761 pub(crate) fn detect_with_kind_with_score_and_deadline(
762 &self,
763 text: &str,
764 unknown_licenses: bool,
765 binary_derived: bool,
766 min_score: f32,
767 deadline: Option<Instant>,
768 ) -> Result<Vec<LicenseDetection>> {
769 ensure_within_deadline(deadline)?;
770 let clean_text = strip_utf8_bom_str(text);
771
772 let content = truncate_detection_text(clean_text);
773
774 ensure_within_deadline(deadline)?;
775 let mut query = if deadline.is_some() {
776 Query::from_extracted_text_with_deadline(
777 content,
778 &self.index,
779 binary_derived,
780 deadline,
781 )?
782 } else {
783 Query::from_extracted_text(content, &self.index, binary_derived)?
784 };
785 let whole_query_run = query.whole_query_run();
786
787 let mut all_matches = Vec::new();
788 let mut candidate_contained_matches = Vec::new();
789 let mut aho_extra_matchables = PositionSet::new();
790 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
791
792 {
795 ensure_within_deadline(deadline)?;
796 let hash_matches = hash_match(&self.index, &whole_query_run);
797
798 if !hash_matches.is_empty() {
799 let mut matches = hash_matches;
800 sort_matches_by_line(&mut matches);
801
802 let groups = split_groups_across_frontmatter_boundary(
803 group_matches_by_region(&matches),
804 Some(content),
805 );
806 let detections: Vec<LicenseDetection> = groups
807 .iter()
808 .map(|group| {
809 let mut detection = empty_detection();
810 populate_detection_from_group_with_spdx(
811 &mut detection,
812 group,
813 &self.spdx_mapping,
814 Some(content),
815 );
816 detection
817 })
818 .collect();
819
820 return Ok(post_process_detections(detections, min_score));
821 }
822 }
823
824 {
826 ensure_within_deadline(deadline)?;
827 let spdx_matches = spdx_lid_match(&self.index, &query);
828 subtract_spdx_match_qspans(
829 &mut query,
830 &mut matched_qspans,
831 &mut aho_extra_matchables,
832 &spdx_matches,
833 );
834 all_matches.extend(spdx_matches);
835 }
836
837 {
839 ensure_within_deadline(deadline)?;
840 let aho_matches = if aho_extra_matchables.is_empty() {
841 if deadline.is_some() {
842 aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
843 } else {
844 aho_match(&self.index, &whole_query_run)
845 }
846 } else {
847 if deadline.is_some() {
848 aho_match::aho_match_with_extra_matchables(
849 &self.index,
850 &whole_query_run,
851 Some(&aho_extra_matchables),
852 deadline,
853 )?
854 } else {
855 aho_match::aho_match_with_extra_matchables(
856 &self.index,
857 &whole_query_run,
858 Some(&aho_extra_matchables),
859 None,
860 )?
861 }
862 };
863
864 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
867 candidate_contained_matches.extend(refined_aho.clone());
868 let (merged_aho, _) = merge_and_prepare_aho_matches(
869 &self.index,
870 &mut query,
871 &mut matched_qspans,
872 &refined_aho,
873 );
874 all_matches.extend(merged_aho);
875
876 let whole_query_followup = collect_whole_query_exact_followup_matches(
877 &self.index,
878 &mut query,
879 &mut matched_qspans,
880 &whole_query_run,
881 deadline,
882 )?;
883 all_matches.extend(whole_query_followup);
884
885 let merged_seq = collect_regular_seq_matches(
886 &self.index,
887 &query,
888 &matched_qspans,
889 &candidate_contained_matches,
890 deadline,
891 )?;
892 all_matches.extend(merged_seq);
893 }
894
895 ensure_within_deadline(deadline)?;
898 let merged_matches =
899 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
900
901 let refined_matches = if unknown_licenses {
904 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
906
907 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
909 let filtered_unknown =
910 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
911
912 let mut all_matches = good_matches;
913 all_matches.extend(filtered_unknown);
914 all_matches.extend(weak_matches);
917 all_matches
918 } else {
919 merged_matches
920 };
921
922 ensure_within_deadline(deadline)?;
924 let refined = refine_matches(&self.index, refined_matches, &query);
925
926 let mut sorted = refined;
927 sort_matches_by_line(&mut sorted);
928
929 let groups = split_groups_across_frontmatter_boundary(
930 group_matches_by_region(&sorted),
931 Some(content),
932 );
933
934 let detections: Vec<LicenseDetection> = groups
935 .iter()
936 .map(|group| {
937 let mut detection = empty_detection();
938 populate_detection_from_group_with_spdx(
939 &mut detection,
940 group,
941 &self.spdx_mapping,
942 Some(content),
943 );
944 detection
945 })
946 .collect();
947
948 let detections = post_process_detections(detections, min_score);
949
950 ensure_within_deadline(deadline)?;
951 Ok(detections)
952 }
953
954 pub fn detect_with_kind_and_source(
955 &self,
956 text: &str,
957 unknown_licenses: bool,
958 binary_derived: bool,
959 source_path: &str,
960 ) -> Result<Vec<LicenseDetection>> {
961 self.detect_with_kind_and_source_with_deadline(
962 text,
963 unknown_licenses,
964 binary_derived,
965 source_path,
966 None,
967 )
968 }
969
970 pub(crate) fn detect_with_kind_and_source_with_deadline(
971 &self,
972 text: &str,
973 unknown_licenses: bool,
974 binary_derived: bool,
975 source_path: &str,
976 deadline: Option<Instant>,
977 ) -> Result<Vec<LicenseDetection>> {
978 let mut detections = self.detect_with_kind_with_score_and_deadline(
979 text,
980 unknown_licenses,
981 binary_derived,
982 0.0,
983 deadline,
984 )?;
985 attach_source_path_to_detections(&mut detections, source_path);
986 Ok(detections)
987 }
988
989 pub fn detect_with_kind_and_source_with_score(
990 &self,
991 text: &str,
992 unknown_licenses: bool,
993 binary_derived: bool,
994 source_path: &str,
995 min_score: f32,
996 ) -> Result<Vec<LicenseDetection>> {
997 let mut detections =
998 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
999 attach_source_path_to_detections(&mut detections, source_path);
1000 Ok(detections)
1001 }
1002
1003 pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
1004 &self,
1005 text: &str,
1006 unknown_licenses: bool,
1007 binary_derived: bool,
1008 source_path: &str,
1009 min_score: f32,
1010 deadline: Option<Instant>,
1011 ) -> Result<Vec<LicenseDetection>> {
1012 let mut detections = self.detect_with_kind_with_score_and_deadline(
1013 text,
1014 unknown_licenses,
1015 binary_derived,
1016 min_score,
1017 deadline,
1018 )?;
1019 attach_source_path_to_detections(&mut detections, source_path);
1020 Ok(detections)
1021 }
1022
1023 #[cfg(any(test, feature = "golden-tests"))]
1028 pub fn detect_matches_with_kind(
1029 &self,
1030 text: &str,
1031 unknown_licenses: bool,
1032 binary_derived: bool,
1033 ) -> Result<Vec<LicenseMatch>> {
1034 let clean_text = strip_utf8_bom_str(text);
1035
1036 let content = truncate_detection_text(clean_text);
1037
1038 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1039 let whole_query_run = query.whole_query_run();
1040
1041 let mut all_matches = Vec::new();
1042 let mut candidate_contained_matches = Vec::new();
1043 let mut aho_extra_matchables = PositionSet::new();
1044 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1045
1046 {
1048 let hash_matches = hash_match(&self.index, &whole_query_run);
1049
1050 if !hash_matches.is_empty() {
1051 let mut matches = hash_matches;
1052 sort_matches_by_line(&mut matches);
1053 return Ok(matches);
1054 }
1055 }
1056
1057 {
1059 let spdx_matches = spdx_lid_match(&self.index, &query);
1060 subtract_spdx_match_qspans(
1061 &mut query,
1062 &mut matched_qspans,
1063 &mut aho_extra_matchables,
1064 &spdx_matches,
1065 );
1066 all_matches.extend(spdx_matches);
1067 }
1068
1069 {
1071 let aho_matches = if aho_extra_matchables.is_empty() {
1072 aho_match(&self.index, &whole_query_run)
1073 } else {
1074 aho_match::aho_match_with_extra_matchables(
1075 &self.index,
1076 &whole_query_run,
1077 Some(&aho_extra_matchables),
1078 None,
1079 )?
1080 };
1081 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1082 candidate_contained_matches.extend(refined_aho.clone());
1083 let (merged_aho, _) = merge_and_prepare_aho_matches(
1084 &self.index,
1085 &mut query,
1086 &mut matched_qspans,
1087 &refined_aho,
1088 );
1089 all_matches.extend(merged_aho);
1090
1091 let whole_query_followup = collect_whole_query_exact_followup_matches(
1092 &self.index,
1093 &mut query,
1094 &mut matched_qspans,
1095 &whole_query_run,
1096 None,
1097 )?;
1098 all_matches.extend(whole_query_followup);
1099
1100 let merged_seq = collect_regular_seq_matches(
1101 &self.index,
1102 &query,
1103 &matched_qspans,
1104 &candidate_contained_matches,
1105 None,
1106 )?;
1107 all_matches.extend(merged_seq);
1108 }
1109
1110 let merged_matches =
1112 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1113
1114 let refined_matches = if unknown_licenses {
1116 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1117 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1118 let filtered_unknown =
1119 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1120
1121 let mut all_matches = good_matches;
1122 all_matches.extend(filtered_unknown);
1123 all_matches.extend(weak_matches);
1124 all_matches
1125 } else {
1126 merged_matches
1127 };
1128
1129 let refined = refine_matches(&self.index, refined_matches, &query);
1131
1132 let mut sorted = refined;
1133 sort_matches_by_line(&mut sorted);
1134
1135 Ok(sorted)
1137 }
1138
1139 pub fn index(&self) -> &index::LicenseIndex {
1141 &self.index
1142 }
1143
1144 pub fn spdx_license_list_version(&self) -> Option<&str> {
1145 self.spdx_license_list_version.as_deref()
1146 }
1147
1148 pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
1149 self.license_index_provenance.as_ref()
1150 }
1151
1152 #[cfg(test)]
1154 pub fn spdx_mapping(&self) -> &SpdxMapping {
1155 &self.spdx_mapping
1156 }
1157}
1158
1159pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1160 for ancestor in search_path.ancestors() {
1161 let candidate = ancestor.join("scancode_config.py");
1162 if candidate.is_file() {
1163 let config = fs::read_to_string(&candidate)?;
1164 return Ok(parse_scancode_spdx_license_list_version(&config));
1165 }
1166 }
1167
1168 Ok(None)
1169}
1170
/// Extracts the value assigned to `spdx_license_list_version` in the text of a
/// ScanCode `scancode_config.py` file.
///
/// The first line that assigns to exactly that name wins. Longer identifiers that
/// merely start with the same prefix (for example `spdx_license_list_version_url`)
/// are ignored, while a type annotation between the name and `=` is accepted.
/// Surrounding quotes are removed, and anything after the closing quote, or after a
/// `#` for an unquoted value, is dropped so trailing comments do not leak into the
/// result.
///
/// Returns `None` when no line assigns the key.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (target, value) = line.trim().split_once('=')?;

        // The assignment target must be the key itself, not a longer identifier.
        let after_key = target.strip_prefix(KEY)?;
        if after_key.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        let value = value.trim();
        let literal = match value.chars().next() {
            // Quoted value: keep what lies between the opening quote and its match.
            Some(quote @ ('"' | '\'')) => {
                let inner = &value[quote.len_utf8()..];
                inner.find(quote).map_or(inner, |end| &inner[..end])
            }
            // Bare value: stop at an inline comment.
            _ => value.split('#').next().unwrap_or(value).trim_end(),
        };

        Some(literal.trim_matches('"').trim_matches('\'').to_string())
    })
}
1184
1185#[cfg(test)]
1186mod tests;