//! License detection: index loading, query construction, and the matcher pipeline
//! exposed through `LicenseDetectionEngine`.

// Submodules of the license detection subsystem.
pub mod aho_match;
pub mod automaton;
pub mod build_policy;
pub mod dataset;
pub(crate) mod detection;
pub mod embedded;
pub mod license_cache;
// Private set types; the types themselves are re-exported from this module.
mod position_set;
mod token_multiset;
mod token_set;

// Compiled for test builds only.
#[cfg(test)]
mod embedded_test;
pub mod expression;
// Golden tests and their helpers are gated behind the `golden-tests` feature.
#[cfg(all(test, feature = "golden-tests"))]
mod golden_test;
#[cfg(feature = "golden-tests")]
pub mod golden_utils;
pub mod hash_match;
pub mod index;
mod match_refine;
pub mod models;
pub mod query;
pub mod rules;
pub mod seq_match;
pub mod spdx_lid;
pub mod spdx_mapping;
// Compiled for test builds only.
#[cfg(test)]
mod test_utils;
pub mod tokenize;
pub mod unknown_match;
34
35use bit_set::BitSet;
36use std::collections::HashSet;
37use std::fs;
38use std::path::Path;
39use std::sync::Arc;
40use std::time::Instant;
41
42use anyhow::Result;
43
44use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
45use crate::license_detection::dataset::{
46 CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
47 load_license_dataset_from_root,
48};
49use crate::license_detection::embedded::index::{
50 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
51};
52use crate::license_detection::index::build_index_from_loaded;
53use crate::license_detection::license_cache::{
54 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
55 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
56};
57use crate::license_detection::query::Query;
58use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
59use crate::models::LicenseIndexProvenance;
60use crate::utils::text::strip_utf8_bom_str;
61
62use crate::license_detection::detection::{
63 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
64};
65use crate::license_detection::models::MatcherKind;
66
/// Relative path of the ScanCode rules directory inside the reference checkout.
// Not referenced in this file; `dead_code` is allowed so the constant can stay defined.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Relative path of the ScanCode licenses directory inside the reference checkout.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Relative path of the ScanCode `licensedcode/data` directory (parent of the rules and
/// licenses directories above).
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for LicenseDB pages; `{}` is a placeholder to be substituted by the caller.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Error message used by `ensure_within_deadline` when the detection deadline has passed.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
86
87pub(crate) use detection::{
88 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
89};
90pub use models::LicenseMatch;
91
92pub use aho_match::aho_match;
93pub use hash_match::hash_match;
94pub use match_refine::{
95 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
96 refine_matches_without_false_positive_filter, split_weak_matches,
97};
98pub use position_set::PositionSet;
99pub use spdx_lid::spdx_lid_match;
100pub use token_multiset::TokenMultiset;
101pub use token_set::TokenSet;
102pub use unknown_match::unknown_match;
103
104use self::seq_match::{
105 MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
106 seq_match_with_candidates_and_deadline,
107};
108
/// License detection engine: a shared license index plus the SPDX metadata used to turn
/// raw matches into detections.
///
/// The index is held behind an [`Arc`], so cloning the engine does not copy the index.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// License index passed to every matcher.
    index: Arc<index::LicenseIndex>,
    /// Mapping built by `build_spdx_mapping` from the index's licenses (sorted by key).
    spdx_mapping: SpdxMapping,
    /// SPDX license list version reported by the dataset the index was built from, if any.
    spdx_license_list_version: Option<String>,
    /// Provenance of the index (embedded artifact or custom dataset), if any.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
121
/// Maximum text length, in bytes, fed to detection; longer input is truncated first.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to sequence-candidate selection for regular query runs.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Per-child budget of container-only positions tolerated by the redundant-container
/// heuristics.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Budget (plus one) of child-only positions tolerated when judging a sequence container
/// redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
126
/// Reports whether `deadline` has been reached.
///
/// A missing deadline (`None`) never expires.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
130
131pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
132 if deadline_exceeded(deadline) {
133 Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
134 } else {
135 Ok(())
136 }
137}
138
139fn truncate_detection_text(clean_text: &str) -> &str {
140 if clean_text.len() <= MAX_DETECTION_SIZE {
141 return clean_text;
142 }
143
144 log::debug!(
145 "Content size {} exceeds limit {}, truncating for detection",
146 clean_text.len(),
147 MAX_DETECTION_SIZE
148 );
149
150 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
151 &clean_text[..boundary]
152}
153
154fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
155 (!m.query_span().is_empty()).then(|| m.query_span().clone())
156}
157
158fn has_full_match_coverage(m: &LicenseMatch) -> bool {
159 m.coverage() == 100.0
160}
161
/// Decides whether a sequence match (`container`) is redundant because full-coverage Aho
/// matches with the same license expression already cover nearly the same query positions.
///
/// Returns `false` unless all of the following hold:
/// - `container` is a `Seq` match with coverage of at least 99;
/// - at least two full-coverage Aho matches with the same expression overlap it, and at
///   least two of them have `matched_length > 1`;
/// - those matches, sorted by query bounds, do not overlap each other;
/// - the positions unique to the container or to the children stay within the
///   `MAX_REDUNDANT_SEQ_CONTAINER_*` budgets, and none of the keep-the-container special
///   cases below applies.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // Full coverage is a subset of `>= 99.0`; both checks are kept as written.
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: exact Aho matches for the same expression that overlap the container.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    // A single child cannot stand in for the container.
    if contained.len() < 2 {
        return false;
    }

    // Require at least two children with a matched length above one.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    contained.sort_by_key(|m| m.qspan_bounds());

    // Union of every position covered by a child.
    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // "Bridge" positions lie in the gaps between consecutive children.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children that overlap each other disqualify the container from removal.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that are NOT in a gap between children.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Keep the container when its only extra position is a single bridge position.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Keep the container when its (at most three) extra positions all sit on one side of
    // the children, i.e. entirely before the first child or entirely after the last one.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        // The upper bound is treated as exclusive here, hence the `- 1`.
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Budgets scale with the number of children; `contained.len() >= 2` is guaranteed
    // above, so the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
266
267fn filter_redundant_same_expression_seq_containers(
268 seq_matches: Vec<LicenseMatch>,
269 candidate_contained_matches: &[LicenseMatch],
270) -> Vec<LicenseMatch> {
271 seq_matches
272 .into_iter()
273 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
274 .collect()
275}
276
277fn is_redundant_low_coverage_composite_seq_wrapper(
278 container: &LicenseMatch,
279 candidate_contained_matches: &[LicenseMatch],
280) -> bool {
281 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
282 return false;
283 }
284
285 let container_qspan_set = container.qspan_set();
286
287 let children: Vec<&LicenseMatch> = candidate_contained_matches
288 .iter()
289 .filter(|m| {
290 m.matcher == aho_match::MATCH_AHO
291 && has_full_match_coverage(m)
292 && m.license_expression != container.license_expression
293 && m.overlaps_with(&container_qspan_set)
294 })
295 .collect();
296
297 if children.len() < 2 {
298 return false;
299 }
300
301 let unique_expressions: HashSet<&str> = children
302 .iter()
303 .map(|m| m.license_expression.as_str())
304 .collect();
305 if unique_expressions.len() < 2 {
306 return false;
307 }
308
309 let mut child_union = PositionSet::new();
310 for m in &children {
311 child_union.extend_from_span(m.query_span());
312 }
313
314 let container_only_positions = container_qspan_set.difference(&child_union);
315 let child_only_positions = child_union.difference(&container_qspan_set);
316
317 let mut sorted_children = children;
318 sorted_children.sort_by_key(|m| m.qspan_bounds());
319
320 let mut bridge_positions = BitSet::new();
321 for pair in sorted_children.windows(2) {
322 let (_, previous_end) = pair[0].qspan_bounds();
323 let (next_start, _) = pair[1].qspan_bounds();
324 for pos in previous_end..next_start {
325 bridge_positions.insert(pos);
326 }
327 }
328
329 let container_only_boundary_positions = container_only_positions
330 .iter()
331 .filter(|&pos| !bridge_positions.contains(pos))
332 .count();
333
334 child_only_positions.is_empty()
335 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
336 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
337}
338
339fn filter_redundant_low_coverage_composite_seq_wrappers(
340 seq_matches: Vec<LicenseMatch>,
341 candidate_contained_matches: &[LicenseMatch],
342) -> Vec<LicenseMatch> {
343 seq_matches
344 .into_iter()
345 .filter(|m| {
346 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
347 })
348 .collect()
349}
350
351fn subtract_spdx_match_qspans(
352 query: &mut Query<'_>,
353 matched_qspans: &mut Vec<models::PositionSpan>,
354 aho_extra_matchables: &mut PositionSet,
355 spdx_matches: &[LicenseMatch],
356) {
357 for m in spdx_matches {
358 let Some(span) = query_span_for_match(m) else {
359 continue;
360 };
361
362 aho_extra_matchables.extend_from_span(&span);
363 query.subtract(&span);
364
365 if has_full_match_coverage(m) {
366 matched_qspans.push(span);
367 }
368 }
369}
370
371fn merge_and_prepare_aho_matches(
372 index: &index::LicenseIndex,
373 query: &mut Query<'_>,
374 matched_qspans: &mut Vec<models::PositionSpan>,
375 refined_aho: &[LicenseMatch],
376) -> (Vec<LicenseMatch>, bool) {
377 let merged_aho = merge_overlapping_matches(refined_aho);
378 let mut saw_long_exact_license_text_match = false;
379
380 for m in &merged_aho {
381 let Some(span) = query_span_for_match(m) else {
382 continue;
383 };
384
385 if has_full_match_coverage(m) {
386 matched_qspans.push(span.clone());
387 }
388
389 if index
390 .rules_by_rid
391 .get(m.rid)
392 .is_some_and(|rule| rule.is_license_text())
393 && m.rule_length > 120
394 && m.coverage() > 98.0
395 {
396 query.subtract(&span);
397 saw_long_exact_license_text_match = true;
398 }
399 }
400
401 (merged_aho, saw_long_exact_license_text_match)
402}
403
404fn collect_whole_query_exact_followup_matches(
405 index: &index::LicenseIndex,
406 query: &mut Query<'_>,
407 matched_qspans: &mut Vec<models::PositionSpan>,
408 whole_run: &query::QueryRun<'_>,
409 deadline: Option<Instant>,
410) -> Result<Vec<LicenseMatch>> {
411 let mut seq_all_matches = Vec::new();
412
413 if whole_run.is_matchable(false, matched_qspans) {
414 let near_dupe_candidates = if deadline.is_some() {
415 select_seq_candidates_with_deadline(
416 index,
417 whole_run,
418 true,
419 MAX_NEAR_DUPE_CANDIDATES,
420 deadline,
421 )?
422 } else {
423 self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
424 };
425
426 if !near_dupe_candidates.is_empty() {
427 let near_dupe_matches = if deadline.is_some() {
428 seq_match_with_candidates_and_deadline(
429 index,
430 whole_run,
431 &near_dupe_candidates,
432 deadline,
433 )?
434 } else {
435 self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
436 };
437
438 for m in &near_dupe_matches {
439 if !m.query_span().is_empty() {
440 let span = m.query_span().clone();
441 query.subtract(&span);
442 matched_qspans.push(span);
443 }
444 }
445
446 seq_all_matches.extend(near_dupe_matches);
447 }
448 }
449
450 Ok(seq_all_matches)
451}
452
453fn collect_regular_seq_matches(
454 index: &index::LicenseIndex,
455 query: &Query<'_>,
456 matched_qspans: &[models::PositionSpan],
457 candidate_contained_matches: &[LicenseMatch],
458 deadline: Option<Instant>,
459) -> Result<Vec<LicenseMatch>> {
460 let mut seq_all_matches = Vec::new();
461
462 for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
463 if query_run_index % 8 == 0 {
464 ensure_within_deadline(deadline)?;
465 }
466
467 if !query_run.is_matchable(false, matched_qspans) {
468 continue;
469 }
470
471 let candidates = if deadline.is_some() {
472 select_seq_candidates_with_deadline(
473 index,
474 &query_run,
475 false,
476 MAX_REGULAR_SEQ_CANDIDATES,
477 deadline,
478 )?
479 } else {
480 self::seq_match::select_seq_candidates(
481 index,
482 &query_run,
483 false,
484 MAX_REGULAR_SEQ_CANDIDATES,
485 )
486 };
487 if !candidates.is_empty() {
488 let matches = if deadline.is_some() {
489 seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
490 } else {
491 self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
492 };
493 seq_all_matches.extend(matches);
494 }
495 }
496
497 let merged_seq = merge_overlapping_matches(&seq_all_matches);
498 let filtered_same_expression =
499 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
500 Ok(filter_redundant_low_coverage_composite_seq_wrappers(
501 filtered_same_expression,
502 candidate_contained_matches,
503 ))
504}
505
506impl LicenseDetectionEngine {
507 fn from_index(
512 index: index::LicenseIndex,
513 spdx_license_list_version: Option<String>,
514 license_index_provenance: Option<LicenseIndexProvenance>,
515 ) -> Result<Self> {
516 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
517 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
518 let spdx_mapping = build_spdx_mapping(&license_vec);
519
520 Ok(Self {
521 index: Arc::new(index),
522 spdx_mapping,
523 spdx_license_list_version,
524 license_index_provenance,
525 })
526 }
527
    /// Test-only constructor: builds an engine from `index` with no SPDX list version and
    /// no provenance.
    ///
    /// # Panics
    ///
    /// Panics if `from_index` returns an error.
    #[cfg(test)]
    pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
        Self::from_index(index, None, None).expect("test index should build license engine")
    }
532
    /// Builds an engine from the embedded license artifact using a cache configuration
    /// rooted at `LicenseCacheConfig::default_root_dir()`.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::from_embedded_with_cache`].
    pub fn from_embedded() -> Result<Self> {
        // NOTE(review): the meaning of the two boolean arguments is defined by
        // `LicenseCacheConfig::new`, which is not visible here — confirm there.
        let cache_config =
            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
        Self::from_embedded_with_cache(&cache_config)
    }
542
543 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
558 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
559 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
560 let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
561 .map_err(|e| {
562 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
563 })?;
564 debug_assert_eq!(
565 artifact_metadata.license_index_provenance.source,
566 EMBEDDED_LICENSE_INDEX_SOURCE
567 );
568 let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
569 let provenance = Some(artifact_metadata.license_index_provenance.clone());
570
571 if !cache_config.reindex {
572 if let Some(cached) =
573 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
574 {
575 let start = Instant::now();
576 eprintln!(
577 "License index loaded from rkyv cache in {:.2}s",
578 start.elapsed().as_secs_f64()
579 );
580 return Self::from_index(cached, spdx_version, provenance);
581 }
582 } else {
583 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
584 }
585
586 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
587 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
588 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
589 let provenance = Some(snapshot.metadata.license_index_provenance.clone());
590
591 let start = Instant::now();
592 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
593 eprintln!(
594 "License index built from embedded artifact in {:.2}s",
595 start.elapsed().as_secs_f64()
596 );
597
598 let mut index = index;
599 index.spdx_license_list_version = spdx_version.clone();
600 if let Err(e) = save_cached_index(
601 cache_config,
602 LicenseCacheNamespace::Embedded,
603 &index,
604 &fingerprint,
605 ) {
606 eprintln!("Warning: failed to save license index cache: {}", e);
607 } else if let Some(size) =
608 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
609 {
610 eprintln!(
611 "License index cache saved ({:.1} MB)",
612 size as f64 / 1_048_576.0
613 );
614 }
615
616 Self::from_index(index, spdx_version, provenance)
617 }
618
    /// Builds an engine from a license dataset rooted at `rules_path`, using a cache
    /// configuration rooted at `LicenseCacheConfig::default_root_dir()`.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`Self::from_directory_with_cache`].
    pub fn from_directory(rules_path: &Path) -> Result<Self> {
        // NOTE(review): the meaning of the two boolean arguments is defined by
        // `LicenseCacheConfig::new`, which is not visible here — confirm there.
        let cache_config =
            LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
        Self::from_directory_with_cache(rules_path, &cache_config)
    }
628
629 pub fn from_directory_with_cache(
641 rules_path: &Path,
642 cache_config: &LicenseCacheConfig,
643 ) -> Result<Self> {
644 let LoadedLicenseDataset {
645 manifest,
646 rules: loaded_rules,
647 licenses: loaded_licenses,
648 } = load_license_dataset_from_root(rules_path)?;
649
650 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
651 let provenance = Some(LicenseIndexProvenance {
652 source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
653 dataset_fingerprint: compute_dataset_fingerprint_string(
654 &loaded_rules,
655 &loaded_licenses,
656 )?,
657 ignored_rules: vec![],
658 ignored_licenses: vec![],
659 ignored_rules_due_to_licenses: vec![],
660 added_rules: vec![],
661 replaced_rules: vec![],
662 added_licenses: vec![],
663 replaced_licenses: vec![],
664 });
665
666 if !cache_config.reindex {
667 if let Some(cached) = load_cached_index(
668 cache_config,
669 LicenseCacheNamespace::CustomRules,
670 &fingerprint,
671 )? {
672 let start = Instant::now();
673 eprintln!(
674 "License index loaded from rkyv cache in {:.2}s",
675 start.elapsed().as_secs_f64()
676 );
677 return Self::from_index(
678 cached,
679 Some(manifest.spdx_license_list_version),
680 provenance,
681 );
682 }
683 } else {
684 delete_cache(
685 cache_config,
686 LicenseCacheNamespace::CustomRules,
687 &fingerprint,
688 )?;
689 }
690
691 let start = Instant::now();
692 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
693 eprintln!(
694 "License index built from custom dataset in {:.2}s",
695 start.elapsed().as_secs_f64()
696 );
697
698 if let Err(e) = save_cached_index(
699 cache_config,
700 LicenseCacheNamespace::CustomRules,
701 &index,
702 &fingerprint,
703 ) {
704 eprintln!("Warning: failed to save license index cache: {}", e);
705 } else if let Some(size) = cache_file_size(
706 cache_config,
707 LicenseCacheNamespace::CustomRules,
708 &fingerprint,
709 ) {
710 eprintln!(
711 "License index cache saved ({:.1} MB)",
712 size as f64 / 1_048_576.0
713 );
714 }
715
716 Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
717 }
718
719 pub fn embedded_spdx_license_list_version() -> Result<String> {
720 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
721 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
722 .map_err(|e| {
723 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
724 })?
725 .spdx_license_list_version)
726 }
727
    /// Detects licenses in `text` with a minimum score of `0.0` and no deadline.
    ///
    /// Thin wrapper over `detect_with_kind_with_score_and_deadline`; see that method for
    /// the meaning of `unknown_licenses` and `binary_derived`.
    pub fn detect_with_kind(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
    ) -> Result<Vec<LicenseDetection>> {
        self.detect_with_kind_with_score_and_deadline(
            text,
            unknown_licenses,
            binary_derived,
            0.0,
            None,
        )
    }
742
    /// Detects licenses in `text`, passing `min_score` to detection post-processing, with
    /// no deadline.
    ///
    /// Thin wrapper over `detect_with_kind_with_score_and_deadline`.
    pub fn detect_with_kind_with_score(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
        min_score: f32,
    ) -> Result<Vec<LicenseDetection>> {
        self.detect_with_kind_with_score_and_deadline(
            text,
            unknown_licenses,
            binary_derived,
            min_score,
            None,
        )
    }
758
759 pub(crate) fn detect_with_kind_with_score_and_deadline(
760 &self,
761 text: &str,
762 unknown_licenses: bool,
763 binary_derived: bool,
764 min_score: f32,
765 deadline: Option<Instant>,
766 ) -> Result<Vec<LicenseDetection>> {
767 ensure_within_deadline(deadline)?;
768 let clean_text = strip_utf8_bom_str(text);
769
770 let content = truncate_detection_text(clean_text);
771
772 ensure_within_deadline(deadline)?;
773 let mut query = if deadline.is_some() {
774 Query::from_extracted_text_with_deadline(
775 content,
776 &self.index,
777 binary_derived,
778 deadline,
779 )?
780 } else {
781 Query::from_extracted_text(content, &self.index, binary_derived)?
782 };
783 let whole_query_run = query.whole_query_run();
784
785 let mut all_matches = Vec::new();
786 let mut candidate_contained_matches = Vec::new();
787 let mut aho_extra_matchables = PositionSet::new();
788 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
789
790 {
793 ensure_within_deadline(deadline)?;
794 let hash_matches = hash_match(&self.index, &whole_query_run);
795
796 if !hash_matches.is_empty() {
797 let mut matches = hash_matches;
798 sort_matches_by_line(&mut matches);
799
800 let groups = group_matches_by_region(&matches);
801 let detections: Vec<LicenseDetection> = groups
802 .iter()
803 .map(|group| {
804 let mut detection = empty_detection();
805 populate_detection_from_group_with_spdx(
806 &mut detection,
807 group,
808 &self.spdx_mapping,
809 Some(content),
810 );
811 detection
812 })
813 .collect();
814
815 return Ok(post_process_detections(detections, min_score));
816 }
817 }
818
819 {
821 ensure_within_deadline(deadline)?;
822 let spdx_matches = spdx_lid_match(&self.index, &query);
823 subtract_spdx_match_qspans(
824 &mut query,
825 &mut matched_qspans,
826 &mut aho_extra_matchables,
827 &spdx_matches,
828 );
829 all_matches.extend(spdx_matches);
830 }
831
832 {
834 ensure_within_deadline(deadline)?;
835 let aho_matches = if aho_extra_matchables.is_empty() {
836 if deadline.is_some() {
837 aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
838 } else {
839 aho_match(&self.index, &whole_query_run)
840 }
841 } else {
842 if deadline.is_some() {
843 aho_match::aho_match_with_extra_matchables(
844 &self.index,
845 &whole_query_run,
846 Some(&aho_extra_matchables),
847 deadline,
848 )?
849 } else {
850 aho_match::aho_match_with_extra_matchables(
851 &self.index,
852 &whole_query_run,
853 Some(&aho_extra_matchables),
854 None,
855 )?
856 }
857 };
858
859 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
862 candidate_contained_matches.extend(refined_aho.clone());
863 let (merged_aho, _) = merge_and_prepare_aho_matches(
864 &self.index,
865 &mut query,
866 &mut matched_qspans,
867 &refined_aho,
868 );
869 all_matches.extend(merged_aho);
870
871 let whole_query_followup = collect_whole_query_exact_followup_matches(
872 &self.index,
873 &mut query,
874 &mut matched_qspans,
875 &whole_query_run,
876 deadline,
877 )?;
878 all_matches.extend(whole_query_followup);
879
880 let merged_seq = collect_regular_seq_matches(
881 &self.index,
882 &query,
883 &matched_qspans,
884 &candidate_contained_matches,
885 deadline,
886 )?;
887 all_matches.extend(merged_seq);
888 }
889
890 ensure_within_deadline(deadline)?;
893 let merged_matches =
894 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
895
896 let refined_matches = if unknown_licenses {
899 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
901
902 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
904 let filtered_unknown =
905 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
906
907 let mut all_matches = good_matches;
908 all_matches.extend(filtered_unknown);
909 all_matches.extend(weak_matches);
912 all_matches
913 } else {
914 merged_matches
915 };
916
917 ensure_within_deadline(deadline)?;
919 let refined = refine_matches(&self.index, refined_matches, &query);
920
921 let mut sorted = refined;
922 sort_matches_by_line(&mut sorted);
923
924 let groups = group_matches_by_region(&sorted);
925
926 let detections: Vec<LicenseDetection> = groups
927 .iter()
928 .map(|group| {
929 let mut detection = empty_detection();
930 populate_detection_from_group_with_spdx(
931 &mut detection,
932 group,
933 &self.spdx_mapping,
934 Some(content),
935 );
936 detection
937 })
938 .collect();
939
940 let detections = post_process_detections(detections, min_score);
941
942 ensure_within_deadline(deadline)?;
943 Ok(detections)
944 }
945
    /// Detects licenses in `text` and attaches `source_path` to every detection, with no
    /// deadline.
    ///
    /// Thin wrapper over `detect_with_kind_and_source_with_deadline`.
    pub fn detect_with_kind_and_source(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
        source_path: &str,
    ) -> Result<Vec<LicenseDetection>> {
        self.detect_with_kind_and_source_with_deadline(
            text,
            unknown_licenses,
            binary_derived,
            source_path,
            None,
        )
    }
961
962 pub(crate) fn detect_with_kind_and_source_with_deadline(
963 &self,
964 text: &str,
965 unknown_licenses: bool,
966 binary_derived: bool,
967 source_path: &str,
968 deadline: Option<Instant>,
969 ) -> Result<Vec<LicenseDetection>> {
970 let mut detections = self.detect_with_kind_with_score_and_deadline(
971 text,
972 unknown_licenses,
973 binary_derived,
974 0.0,
975 deadline,
976 )?;
977 attach_source_path_to_detections(&mut detections, source_path);
978 Ok(detections)
979 }
980
981 pub fn detect_with_kind_and_source_with_score(
982 &self,
983 text: &str,
984 unknown_licenses: bool,
985 binary_derived: bool,
986 source_path: &str,
987 min_score: f32,
988 ) -> Result<Vec<LicenseDetection>> {
989 let mut detections =
990 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
991 attach_source_path_to_detections(&mut detections, source_path);
992 Ok(detections)
993 }
994
995 pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
996 &self,
997 text: &str,
998 unknown_licenses: bool,
999 binary_derived: bool,
1000 source_path: &str,
1001 min_score: f32,
1002 deadline: Option<Instant>,
1003 ) -> Result<Vec<LicenseDetection>> {
1004 let mut detections = self.detect_with_kind_with_score_and_deadline(
1005 text,
1006 unknown_licenses,
1007 binary_derived,
1008 min_score,
1009 deadline,
1010 )?;
1011 attach_source_path_to_detections(&mut detections, source_path);
1012 Ok(detections)
1013 }
1014
    /// Runs the matching pipeline and returns the refined matches, sorted by line, without
    /// grouping them into detections.
    ///
    /// Mirrors the matcher sequence of `detect_with_kind_with_score_and_deadline` with no
    /// deadline. Only compiled for tests or with the `golden-tests` feature.
    #[cfg(any(test, feature = "golden-tests"))]
    pub fn detect_matches_with_kind(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
    ) -> Result<Vec<LicenseMatch>> {
        let clean_text = strip_utf8_bom_str(text);

        let content = truncate_detection_text(clean_text);

        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
        let whole_query_run = query.whole_query_run();

        let mut all_matches = Vec::new();
        let mut candidate_contained_matches = Vec::new();
        let mut aho_extra_matchables = PositionSet::new();
        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();

        {
            // Hash matching: any hit is returned immediately, skipping every later matcher.
            let hash_matches = hash_match(&self.index, &whole_query_run);

            if !hash_matches.is_empty() {
                let mut matches = hash_matches;
                sort_matches_by_line(&mut matches);
                return Ok(matches);
            }
        }

        {
            // SPDX identifiers: matched spans are subtracted from the query and offered to
            // the Aho matcher as extra matchable positions.
            let spdx_matches = spdx_lid_match(&self.index, &query);
            subtract_spdx_match_qspans(
                &mut query,
                &mut matched_qspans,
                &mut aho_extra_matchables,
                &spdx_matches,
            );
            all_matches.extend(spdx_matches);
        }

        {
            // Aho matching, followed by the whole-query and regular sequence passes.
            let aho_matches = if aho_extra_matchables.is_empty() {
                aho_match(&self.index, &whole_query_run)
            } else {
                aho_match::aho_match_with_extra_matchables(
                    &self.index,
                    &whole_query_run,
                    Some(&aho_extra_matchables),
                    None,
                )?
            };
            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
            // Keep the refined (unmerged) Aho matches: the sequence filters compare
            // containers against them.
            candidate_contained_matches.extend(refined_aho.clone());
            let (merged_aho, _) = merge_and_prepare_aho_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &refined_aho,
            );
            all_matches.extend(merged_aho);

            let whole_query_followup = collect_whole_query_exact_followup_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &whole_query_run,
                None,
            )?;
            all_matches.extend(whole_query_followup);

            let merged_seq = collect_regular_seq_matches(
                &self.index,
                &query,
                &matched_qspans,
                &candidate_contained_matches,
                None,
            )?;
            all_matches.extend(merged_seq);
        }

        let merged_matches =
            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);

        let refined_matches = if unknown_licenses {
            // Unknown matching is computed against the good matches only; weak matches
            // are appended afterwards.
            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
            let filtered_unknown =
                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);

            let mut all_matches = good_matches;
            all_matches.extend(filtered_unknown);
            all_matches.extend(weak_matches);
            all_matches
        } else {
            merged_matches
        };

        let refined = refine_matches(&self.index, refined_matches, &query);

        let mut sorted = refined;
        sort_matches_by_line(&mut sorted);

        Ok(sorted)
    }
1130
    /// Borrows the license index used by this engine.
    pub fn index(&self) -> &index::LicenseIndex {
        // Deref-coerces from `&Arc<LicenseIndex>` to `&LicenseIndex`.
        &self.index
    }

    /// SPDX license list version associated with the index, if one was recorded.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }

    /// Provenance of the license index, if one was recorded.
    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
        self.license_index_provenance.as_ref()
    }

    /// Test-only accessor for the SPDX mapping built from the index.
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
1149}
1150
1151pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1152 for ancestor in search_path.ancestors() {
1153 let candidate = ancestor.join("scancode_config.py");
1154 if candidate.is_file() {
1155 let config = fs::read_to_string(&candidate)?;
1156 return Ok(parse_scancode_spdx_license_list_version(&config));
1157 }
1158 }
1159
1160 Ok(None)
1161}
1162
/// Extracts the value assigned to `spdx_license_list_version` in the text of a ScanCode
/// config file.
///
/// The first line of the form `spdx_license_list_version = <value>` wins. An optional
/// Python type annotation on the name (`name: str = ...`) is accepted. A trailing
/// `# comment` is dropped, and surrounding whitespace and quotes are stripped from the
/// value.
///
/// Only the exact key matches: longer names that merely start with
/// `spdx_license_list_version` (for example `spdx_license_list_version_url`) are ignored.
///
/// Returns `None` when no matching assignment exists. A `#` inside the quoted value is
/// treated as the start of a comment.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (lhs, rhs) = line.split_once('=')?;

        // Drop an optional `: type` annotation, then require the exact key name.
        let name = lhs.split(':').next().unwrap_or(lhs).trim();
        if name != KEY {
            return None;
        }

        // Strip an inline comment before removing whitespace and quotes.
        let value = rhs
            .split('#')
            .next()
            .unwrap_or(rhs)
            .trim()
            .trim_matches('"')
            .trim_matches('\'');
        Some(value.to_string())
    })
}
1176
1177#[cfg(test)]
1178mod tests;