1pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(all(test, feature = "golden-tests"))]
21mod golden_test;
22#[cfg(feature = "golden-tests")]
23pub mod golden_utils;
24pub mod hash_match;
25pub mod index;
26mod match_refine;
27pub mod models;
28pub mod query;
29pub mod rules;
30pub mod seq_match;
31pub mod spdx_lid;
32pub mod spdx_mapping;
33#[cfg(test)]
34mod test_utils;
35pub mod tokenize;
36pub mod unknown_match;
37
38use bit_set::BitSet;
39use std::collections::HashSet;
40use std::fs;
41use std::path::Path;
42use std::sync::Arc;
43use std::time::Instant;
44
45use anyhow::Result;
46
47use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
48use crate::license_detection::dataset::{
49 CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
50 load_license_dataset_from_root,
51};
52use crate::license_detection::embedded::index::{
53 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
54};
55use crate::license_detection::index::build_index_from_loaded;
56use crate::license_detection::license_cache::{
57 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
58 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
59};
60use crate::license_detection::query::Query;
61use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
62use crate::models::LicenseIndexProvenance;
63use crate::utils::text::strip_utf8_bom_str;
64
65use crate::license_detection::detection::{
66 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
67};
68use crate::license_detection::models::MatcherKind;
69
/// Relative path of the ScanCode rule files inside the reference checkout.
// NOTE(review): `dead_code` is presumably allowed because not every build
// configuration references this path — confirm before removing the attribute.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Relative path of the ScanCode license files inside the reference checkout.
// NOTE(review): see the `dead_code` remark on the rules path — confirm.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Relative path of the ScanCode data directory (the parent of the rules and
/// licenses directories above).
// NOTE(review): see the `dead_code` remark on the rules path — confirm.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for LicenseDB pages; `{}` is the placeholder to substitute
/// (presumably with a license key — verify against callers).
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";

/// Error message used when license detection runs past its deadline.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
89
90pub(crate) use detection::{
91 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
92};
93pub use models::LicenseMatch;
94
95pub use aho_match::aho_match;
96pub use hash_match::hash_match;
97pub use match_refine::{
98 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
99 refine_matches_without_false_positive_filter, split_weak_matches,
100};
101pub use position_set::PositionSet;
102pub use spdx_lid::spdx_lid_match;
103pub use token_multiset::TokenMultiset;
104pub use token_set::TokenSet;
105pub use unknown_match::unknown_match;
106
107use self::seq_match::{
108 MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
109 seq_match_with_candidates_and_deadline,
110};
111
112#[derive(Debug, Clone)]
118pub struct LicenseDetectionEngine {
119 index: Arc<index::LicenseIndex>,
120 spdx_mapping: SpdxMapping,
121 spdx_license_list_version: Option<String>,
122 license_index_provenance: Option<LicenseIndexProvenance>,
123}
124
/// Largest input, in bytes, that is handed to detection; longer text is truncated.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;

/// Candidate cap passed to sequence candidate selection for regular query runs.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;

/// Per-child allowance of container-only positions when judging a sequence
/// container as redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;

/// Allowance (plus one) of positions covered by the children but not by the
/// container when judging a sequence container as redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
129
/// Returns `true` when a deadline is set and the current instant has reached
/// or passed it; a missing deadline never expires.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
133
134pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
135 if deadline_exceeded(deadline) {
136 Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
137 } else {
138 Ok(())
139 }
140}
141
142fn truncate_detection_text(clean_text: &str) -> &str {
143 if clean_text.len() <= MAX_DETECTION_SIZE {
144 return clean_text;
145 }
146
147 log::debug!(
148 "Content size {} exceeds limit {}, truncating for detection",
149 clean_text.len(),
150 MAX_DETECTION_SIZE
151 );
152
153 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
154 &clean_text[..boundary]
155}
156
157fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
158 (!m.query_span().is_empty()).then(|| m.query_span().clone())
159}
160
161fn has_full_match_coverage(m: &LicenseMatch) -> bool {
162 m.coverage() == 100.0
163}
164
165fn is_redundant_same_expression_seq_container(
166 container: &LicenseMatch,
167 candidate_contained_matches: &[LicenseMatch],
168) -> bool {
169 let container_is_redundant_coverage =
170 has_full_match_coverage(container) || container.coverage() >= 99.0;
171 if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
172 return false;
173 }
174
175 let container_qspan_set = container.qspan_set();
176
177 let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
178 .iter()
179 .filter(|m| {
180 m.matcher == MatcherKind::Aho
181 && has_full_match_coverage(m)
182 && m.license_expression == container.license_expression
183 && m.overlaps_with(&container_qspan_set)
184 })
185 .collect();
186
187 if contained.len() < 2 {
188 return false;
189 }
190
191 let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
192 if material_children < 2 {
193 return false;
194 }
195
196 contained.sort_by_key(|m| m.qspan_bounds());
197
198 let mut child_union = PositionSet::new();
199 for m in &contained {
200 child_union.extend_from_span(m.query_span());
201 }
202
203 let container_only_positions = container_qspan_set.difference(&child_union);
204 let child_only_positions = child_union.difference(&container_qspan_set);
205
206 let mut bridge_positions = BitSet::new();
207 for pair in contained.windows(2) {
208 let (_, previous_end) = pair[0].qspan_bounds();
209 let (next_start, _) = pair[1].qspan_bounds();
210
211 if next_start < previous_end {
212 return false;
213 }
214
215 for pos in previous_end..next_start {
216 bridge_positions.insert(pos);
217 }
218 }
219
220 let container_only_boundary_positions = container_only_positions
221 .iter()
222 .filter(|&pos| !bridge_positions.contains(pos))
223 .count();
224
225 if container_only_positions.len() == 1
226 && container_only_boundary_positions == 0
227 && child_only_positions.is_empty()
228 {
229 return false;
230 }
231
232 if child_only_positions.is_empty()
233 && container_only_positions.len() == container_only_boundary_positions
234 && container_only_boundary_positions <= 3
235 {
236 let earliest_child = contained
237 .iter()
238 .map(|m| m.qspan_bounds().0)
239 .min()
240 .unwrap_or(usize::MAX);
241 let latest_child = contained
242 .iter()
243 .map(|m| m.qspan_bounds().1.saturating_sub(1))
244 .max()
245 .unwrap_or(0);
246
247 let is_one_sided_boundary = container_only_positions
248 .iter()
249 .all(|pos| pos < earliest_child)
250 || container_only_positions
251 .iter()
252 .all(|pos| pos > latest_child);
253
254 if is_one_sided_boundary {
255 return false;
256 }
257 }
258
259 let max_container_only_positions =
260 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
261 let max_container_boundary_positions =
262 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
263 let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
264
265 container_only_positions.len() <= max_container_only_positions
266 && container_only_boundary_positions <= max_container_boundary_positions
267 && child_only_positions.len() <= max_child_only_positions
268}
269
270fn filter_redundant_same_expression_seq_containers(
271 seq_matches: Vec<LicenseMatch>,
272 candidate_contained_matches: &[LicenseMatch],
273) -> Vec<LicenseMatch> {
274 seq_matches
275 .into_iter()
276 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
277 .collect()
278}
279
280fn is_redundant_low_coverage_composite_seq_wrapper(
281 container: &LicenseMatch,
282 candidate_contained_matches: &[LicenseMatch],
283) -> bool {
284 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
285 return false;
286 }
287
288 let container_qspan_set = container.qspan_set();
289
290 let children: Vec<&LicenseMatch> = candidate_contained_matches
291 .iter()
292 .filter(|m| {
293 m.matcher == aho_match::MATCH_AHO
294 && has_full_match_coverage(m)
295 && m.license_expression != container.license_expression
296 && m.overlaps_with(&container_qspan_set)
297 })
298 .collect();
299
300 if children.len() < 2 {
301 return false;
302 }
303
304 let unique_expressions: HashSet<&str> = children
305 .iter()
306 .map(|m| m.license_expression.as_str())
307 .collect();
308 if unique_expressions.len() < 2 {
309 return false;
310 }
311
312 let mut child_union = PositionSet::new();
313 for m in &children {
314 child_union.extend_from_span(m.query_span());
315 }
316
317 let container_only_positions = container_qspan_set.difference(&child_union);
318 let child_only_positions = child_union.difference(&container_qspan_set);
319
320 let mut sorted_children = children;
321 sorted_children.sort_by_key(|m| m.qspan_bounds());
322
323 let mut bridge_positions = BitSet::new();
324 for pair in sorted_children.windows(2) {
325 let (_, previous_end) = pair[0].qspan_bounds();
326 let (next_start, _) = pair[1].qspan_bounds();
327 for pos in previous_end..next_start {
328 bridge_positions.insert(pos);
329 }
330 }
331
332 let container_only_boundary_positions = container_only_positions
333 .iter()
334 .filter(|&pos| !bridge_positions.contains(pos))
335 .count();
336
337 child_only_positions.is_empty()
338 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
339 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
340}
341
342fn filter_redundant_low_coverage_composite_seq_wrappers(
343 seq_matches: Vec<LicenseMatch>,
344 candidate_contained_matches: &[LicenseMatch],
345) -> Vec<LicenseMatch> {
346 seq_matches
347 .into_iter()
348 .filter(|m| {
349 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
350 })
351 .collect()
352}
353
354fn subtract_spdx_match_qspans(
355 query: &mut Query<'_>,
356 matched_qspans: &mut Vec<models::PositionSpan>,
357 aho_extra_matchables: &mut PositionSet,
358 spdx_matches: &[LicenseMatch],
359) {
360 for m in spdx_matches {
361 let Some(span) = query_span_for_match(m) else {
362 continue;
363 };
364
365 aho_extra_matchables.extend_from_span(&span);
366 query.subtract(&span);
367
368 if has_full_match_coverage(m) {
369 matched_qspans.push(span);
370 }
371 }
372}
373
374fn merge_and_prepare_aho_matches(
375 index: &index::LicenseIndex,
376 query: &mut Query<'_>,
377 matched_qspans: &mut Vec<models::PositionSpan>,
378 refined_aho: &[LicenseMatch],
379) -> (Vec<LicenseMatch>, bool) {
380 let merged_aho = merge_overlapping_matches(refined_aho);
381 let mut saw_long_exact_license_text_match = false;
382
383 for m in &merged_aho {
384 let Some(span) = query_span_for_match(m) else {
385 continue;
386 };
387
388 if has_full_match_coverage(m) {
389 matched_qspans.push(span.clone());
390 }
391
392 if index
393 .rules_by_rid
394 .get(m.rid)
395 .is_some_and(|rule| rule.is_license_text())
396 && m.rule_length > 120
397 && m.coverage() > 98.0
398 {
399 query.subtract(&span);
400 saw_long_exact_license_text_match = true;
401 }
402 }
403
404 (merged_aho, saw_long_exact_license_text_match)
405}
406
407fn collect_whole_query_exact_followup_matches(
408 index: &index::LicenseIndex,
409 query: &mut Query<'_>,
410 matched_qspans: &mut Vec<models::PositionSpan>,
411 whole_run: &query::QueryRun<'_>,
412 deadline: Option<Instant>,
413) -> Result<Vec<LicenseMatch>> {
414 let mut seq_all_matches = Vec::new();
415
416 if whole_run.is_matchable(false, matched_qspans) {
417 let near_dupe_candidates = if deadline.is_some() {
418 select_seq_candidates_with_deadline(
419 index,
420 whole_run,
421 true,
422 MAX_NEAR_DUPE_CANDIDATES,
423 deadline,
424 )?
425 } else {
426 self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
427 };
428
429 if !near_dupe_candidates.is_empty() {
430 let near_dupe_matches = if deadline.is_some() {
431 seq_match_with_candidates_and_deadline(
432 index,
433 whole_run,
434 &near_dupe_candidates,
435 deadline,
436 )?
437 } else {
438 self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
439 };
440
441 for m in &near_dupe_matches {
442 if !m.query_span().is_empty() {
443 let span = m.query_span().clone();
444 query.subtract(&span);
445 matched_qspans.push(span);
446 }
447 }
448
449 seq_all_matches.extend(near_dupe_matches);
450 }
451 }
452
453 Ok(seq_all_matches)
454}
455
456fn collect_regular_seq_matches(
457 index: &index::LicenseIndex,
458 query: &Query<'_>,
459 matched_qspans: &[models::PositionSpan],
460 candidate_contained_matches: &[LicenseMatch],
461 deadline: Option<Instant>,
462) -> Result<Vec<LicenseMatch>> {
463 let mut seq_all_matches = Vec::new();
464
465 for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
466 if query_run_index % 8 == 0 {
467 ensure_within_deadline(deadline)?;
468 }
469
470 if !query_run.is_matchable(false, matched_qspans) {
471 continue;
472 }
473
474 let candidates = if deadline.is_some() {
475 select_seq_candidates_with_deadline(
476 index,
477 &query_run,
478 false,
479 MAX_REGULAR_SEQ_CANDIDATES,
480 deadline,
481 )?
482 } else {
483 self::seq_match::select_seq_candidates(
484 index,
485 &query_run,
486 false,
487 MAX_REGULAR_SEQ_CANDIDATES,
488 )
489 };
490 if !candidates.is_empty() {
491 let matches = if deadline.is_some() {
492 seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
493 } else {
494 self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
495 };
496 seq_all_matches.extend(matches);
497 }
498 }
499
500 let merged_seq = merge_overlapping_matches(&seq_all_matches);
501 let filtered_same_expression =
502 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
503 Ok(filter_redundant_low_coverage_composite_seq_wrappers(
504 filtered_same_expression,
505 candidate_contained_matches,
506 ))
507}
508
509impl LicenseDetectionEngine {
510 fn from_index(
515 index: index::LicenseIndex,
516 spdx_license_list_version: Option<String>,
517 license_index_provenance: Option<LicenseIndexProvenance>,
518 ) -> Result<Self> {
519 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
520 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
521 let spdx_mapping = build_spdx_mapping(&license_vec);
522
523 Ok(Self {
524 index: Arc::new(index),
525 spdx_mapping,
526 spdx_license_list_version,
527 license_index_provenance,
528 })
529 }
530
531 #[cfg(test)]
532 pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
533 Self::from_index(index, None, None).expect("test index should build license engine")
534 }
535
536 pub fn from_embedded() -> Result<Self> {
541 let cache_config =
542 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
543 Self::from_embedded_with_cache(&cache_config)
544 }
545
546 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
561 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
562 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
563 let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
564 .map_err(|e| {
565 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
566 })?;
567 debug_assert_eq!(
568 artifact_metadata.license_index_provenance.source,
569 EMBEDDED_LICENSE_INDEX_SOURCE
570 );
571 let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
572 let provenance = Some(artifact_metadata.license_index_provenance.clone());
573
574 if !cache_config.reindex {
575 if let Some(cached) =
576 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
577 {
578 let start = Instant::now();
579 eprintln!(
580 "License index loaded from rkyv cache in {:.2}s",
581 start.elapsed().as_secs_f64()
582 );
583 return Self::from_index(cached, spdx_version, provenance);
584 }
585 } else {
586 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
587 }
588
589 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
590 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
591 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
592 let provenance = Some(snapshot.metadata.license_index_provenance.clone());
593
594 let start = Instant::now();
595 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
596 eprintln!(
597 "License index built from embedded artifact in {:.2}s",
598 start.elapsed().as_secs_f64()
599 );
600
601 let mut index = index;
602 index.spdx_license_list_version = spdx_version.clone();
603 if let Err(e) = save_cached_index(
604 cache_config,
605 LicenseCacheNamespace::Embedded,
606 &index,
607 &fingerprint,
608 ) {
609 eprintln!("Warning: failed to save license index cache: {}", e);
610 } else if let Some(size) =
611 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
612 {
613 eprintln!(
614 "License index cache saved ({:.1} MB)",
615 size as f64 / 1_048_576.0
616 );
617 }
618
619 Self::from_index(index, spdx_version, provenance)
620 }
621
622 pub fn from_directory(rules_path: &Path) -> Result<Self> {
627 let cache_config =
628 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
629 Self::from_directory_with_cache(rules_path, &cache_config)
630 }
631
632 pub fn from_directory_with_cache(
644 rules_path: &Path,
645 cache_config: &LicenseCacheConfig,
646 ) -> Result<Self> {
647 let LoadedLicenseDataset {
648 manifest,
649 rules: loaded_rules,
650 licenses: loaded_licenses,
651 } = load_license_dataset_from_root(rules_path)?;
652
653 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
654 let provenance = Some(LicenseIndexProvenance {
655 source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
656 dataset_fingerprint: compute_dataset_fingerprint_string(
657 &loaded_rules,
658 &loaded_licenses,
659 )?,
660 ignored_rules: vec![],
661 ignored_licenses: vec![],
662 ignored_rules_due_to_licenses: vec![],
663 added_rules: vec![],
664 replaced_rules: vec![],
665 added_licenses: vec![],
666 replaced_licenses: vec![],
667 });
668
669 if !cache_config.reindex {
670 if let Some(cached) = load_cached_index(
671 cache_config,
672 LicenseCacheNamespace::CustomRules,
673 &fingerprint,
674 )? {
675 let start = Instant::now();
676 eprintln!(
677 "License index loaded from rkyv cache in {:.2}s",
678 start.elapsed().as_secs_f64()
679 );
680 return Self::from_index(
681 cached,
682 Some(manifest.spdx_license_list_version),
683 provenance,
684 );
685 }
686 } else {
687 delete_cache(
688 cache_config,
689 LicenseCacheNamespace::CustomRules,
690 &fingerprint,
691 )?;
692 }
693
694 let start = Instant::now();
695 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
696 eprintln!(
697 "License index built from custom dataset in {:.2}s",
698 start.elapsed().as_secs_f64()
699 );
700
701 if let Err(e) = save_cached_index(
702 cache_config,
703 LicenseCacheNamespace::CustomRules,
704 &index,
705 &fingerprint,
706 ) {
707 eprintln!("Warning: failed to save license index cache: {}", e);
708 } else if let Some(size) = cache_file_size(
709 cache_config,
710 LicenseCacheNamespace::CustomRules,
711 &fingerprint,
712 ) {
713 eprintln!(
714 "License index cache saved ({:.1} MB)",
715 size as f64 / 1_048_576.0
716 );
717 }
718
719 Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
720 }
721
722 pub fn embedded_spdx_license_list_version() -> Result<String> {
723 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
724 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
725 .map_err(|e| {
726 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
727 })?
728 .spdx_license_list_version)
729 }
730
731 pub fn detect_with_kind(
732 &self,
733 text: &str,
734 unknown_licenses: bool,
735 binary_derived: bool,
736 ) -> Result<Vec<LicenseDetection>> {
737 self.detect_with_kind_with_score_and_deadline(
738 text,
739 unknown_licenses,
740 binary_derived,
741 0.0,
742 None,
743 )
744 }
745
746 pub fn detect_with_kind_with_score(
747 &self,
748 text: &str,
749 unknown_licenses: bool,
750 binary_derived: bool,
751 min_score: f32,
752 ) -> Result<Vec<LicenseDetection>> {
753 self.detect_with_kind_with_score_and_deadline(
754 text,
755 unknown_licenses,
756 binary_derived,
757 min_score,
758 None,
759 )
760 }
761
762 pub(crate) fn detect_with_kind_with_score_and_deadline(
763 &self,
764 text: &str,
765 unknown_licenses: bool,
766 binary_derived: bool,
767 min_score: f32,
768 deadline: Option<Instant>,
769 ) -> Result<Vec<LicenseDetection>> {
770 ensure_within_deadline(deadline)?;
771 let clean_text = strip_utf8_bom_str(text);
772
773 let content = truncate_detection_text(clean_text);
774
775 ensure_within_deadline(deadline)?;
776 let mut query = if deadline.is_some() {
777 Query::from_extracted_text_with_deadline(
778 content,
779 &self.index,
780 binary_derived,
781 deadline,
782 )?
783 } else {
784 Query::from_extracted_text(content, &self.index, binary_derived)?
785 };
786 let whole_query_run = query.whole_query_run();
787
788 let mut all_matches = Vec::new();
789 let mut candidate_contained_matches = Vec::new();
790 let mut aho_extra_matchables = PositionSet::new();
791 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
792
793 {
796 ensure_within_deadline(deadline)?;
797 let hash_matches = hash_match(&self.index, &whole_query_run);
798
799 if !hash_matches.is_empty() {
800 let mut matches = hash_matches;
801 sort_matches_by_line(&mut matches);
802
803 let groups = group_matches_by_region(&matches);
804 let detections: Vec<LicenseDetection> = groups
805 .iter()
806 .map(|group| {
807 let mut detection = empty_detection();
808 populate_detection_from_group_with_spdx(
809 &mut detection,
810 group,
811 &self.spdx_mapping,
812 Some(content),
813 );
814 detection
815 })
816 .collect();
817
818 return Ok(post_process_detections(detections, min_score));
819 }
820 }
821
822 {
824 ensure_within_deadline(deadline)?;
825 let spdx_matches = spdx_lid_match(&self.index, &query);
826 subtract_spdx_match_qspans(
827 &mut query,
828 &mut matched_qspans,
829 &mut aho_extra_matchables,
830 &spdx_matches,
831 );
832 all_matches.extend(spdx_matches);
833 }
834
835 {
837 ensure_within_deadline(deadline)?;
838 let aho_matches = if aho_extra_matchables.is_empty() {
839 if deadline.is_some() {
840 aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
841 } else {
842 aho_match(&self.index, &whole_query_run)
843 }
844 } else {
845 if deadline.is_some() {
846 aho_match::aho_match_with_extra_matchables(
847 &self.index,
848 &whole_query_run,
849 Some(&aho_extra_matchables),
850 deadline,
851 )?
852 } else {
853 aho_match::aho_match_with_extra_matchables(
854 &self.index,
855 &whole_query_run,
856 Some(&aho_extra_matchables),
857 None,
858 )?
859 }
860 };
861
862 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
865 candidate_contained_matches.extend(refined_aho.clone());
866 let (merged_aho, _) = merge_and_prepare_aho_matches(
867 &self.index,
868 &mut query,
869 &mut matched_qspans,
870 &refined_aho,
871 );
872 all_matches.extend(merged_aho);
873
874 let whole_query_followup = collect_whole_query_exact_followup_matches(
875 &self.index,
876 &mut query,
877 &mut matched_qspans,
878 &whole_query_run,
879 deadline,
880 )?;
881 all_matches.extend(whole_query_followup);
882
883 let merged_seq = collect_regular_seq_matches(
884 &self.index,
885 &query,
886 &matched_qspans,
887 &candidate_contained_matches,
888 deadline,
889 )?;
890 all_matches.extend(merged_seq);
891 }
892
893 ensure_within_deadline(deadline)?;
896 let merged_matches =
897 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
898
899 let refined_matches = if unknown_licenses {
902 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
904
905 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
907 let filtered_unknown =
908 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
909
910 let mut all_matches = good_matches;
911 all_matches.extend(filtered_unknown);
912 all_matches.extend(weak_matches);
915 all_matches
916 } else {
917 merged_matches
918 };
919
920 ensure_within_deadline(deadline)?;
922 let refined = refine_matches(&self.index, refined_matches, &query);
923
924 let mut sorted = refined;
925 sort_matches_by_line(&mut sorted);
926
927 let groups = group_matches_by_region(&sorted);
928
929 let detections: Vec<LicenseDetection> = groups
930 .iter()
931 .map(|group| {
932 let mut detection = empty_detection();
933 populate_detection_from_group_with_spdx(
934 &mut detection,
935 group,
936 &self.spdx_mapping,
937 Some(content),
938 );
939 detection
940 })
941 .collect();
942
943 let detections = post_process_detections(detections, min_score);
944
945 ensure_within_deadline(deadline)?;
946 Ok(detections)
947 }
948
949 pub fn detect_with_kind_and_source(
950 &self,
951 text: &str,
952 unknown_licenses: bool,
953 binary_derived: bool,
954 source_path: &str,
955 ) -> Result<Vec<LicenseDetection>> {
956 self.detect_with_kind_and_source_with_deadline(
957 text,
958 unknown_licenses,
959 binary_derived,
960 source_path,
961 None,
962 )
963 }
964
965 pub(crate) fn detect_with_kind_and_source_with_deadline(
966 &self,
967 text: &str,
968 unknown_licenses: bool,
969 binary_derived: bool,
970 source_path: &str,
971 deadline: Option<Instant>,
972 ) -> Result<Vec<LicenseDetection>> {
973 let mut detections = self.detect_with_kind_with_score_and_deadline(
974 text,
975 unknown_licenses,
976 binary_derived,
977 0.0,
978 deadline,
979 )?;
980 attach_source_path_to_detections(&mut detections, source_path);
981 Ok(detections)
982 }
983
984 pub fn detect_with_kind_and_source_with_score(
985 &self,
986 text: &str,
987 unknown_licenses: bool,
988 binary_derived: bool,
989 source_path: &str,
990 min_score: f32,
991 ) -> Result<Vec<LicenseDetection>> {
992 let mut detections =
993 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
994 attach_source_path_to_detections(&mut detections, source_path);
995 Ok(detections)
996 }
997
998 pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
999 &self,
1000 text: &str,
1001 unknown_licenses: bool,
1002 binary_derived: bool,
1003 source_path: &str,
1004 min_score: f32,
1005 deadline: Option<Instant>,
1006 ) -> Result<Vec<LicenseDetection>> {
1007 let mut detections = self.detect_with_kind_with_score_and_deadline(
1008 text,
1009 unknown_licenses,
1010 binary_derived,
1011 min_score,
1012 deadline,
1013 )?;
1014 attach_source_path_to_detections(&mut detections, source_path);
1015 Ok(detections)
1016 }
1017
1018 #[cfg(any(test, feature = "golden-tests"))]
1023 pub fn detect_matches_with_kind(
1024 &self,
1025 text: &str,
1026 unknown_licenses: bool,
1027 binary_derived: bool,
1028 ) -> Result<Vec<LicenseMatch>> {
1029 let clean_text = strip_utf8_bom_str(text);
1030
1031 let content = truncate_detection_text(clean_text);
1032
1033 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1034 let whole_query_run = query.whole_query_run();
1035
1036 let mut all_matches = Vec::new();
1037 let mut candidate_contained_matches = Vec::new();
1038 let mut aho_extra_matchables = PositionSet::new();
1039 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1040
1041 {
1043 let hash_matches = hash_match(&self.index, &whole_query_run);
1044
1045 if !hash_matches.is_empty() {
1046 let mut matches = hash_matches;
1047 sort_matches_by_line(&mut matches);
1048 return Ok(matches);
1049 }
1050 }
1051
1052 {
1054 let spdx_matches = spdx_lid_match(&self.index, &query);
1055 subtract_spdx_match_qspans(
1056 &mut query,
1057 &mut matched_qspans,
1058 &mut aho_extra_matchables,
1059 &spdx_matches,
1060 );
1061 all_matches.extend(spdx_matches);
1062 }
1063
1064 {
1066 let aho_matches = if aho_extra_matchables.is_empty() {
1067 aho_match(&self.index, &whole_query_run)
1068 } else {
1069 aho_match::aho_match_with_extra_matchables(
1070 &self.index,
1071 &whole_query_run,
1072 Some(&aho_extra_matchables),
1073 None,
1074 )?
1075 };
1076 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1077 candidate_contained_matches.extend(refined_aho.clone());
1078 let (merged_aho, _) = merge_and_prepare_aho_matches(
1079 &self.index,
1080 &mut query,
1081 &mut matched_qspans,
1082 &refined_aho,
1083 );
1084 all_matches.extend(merged_aho);
1085
1086 let whole_query_followup = collect_whole_query_exact_followup_matches(
1087 &self.index,
1088 &mut query,
1089 &mut matched_qspans,
1090 &whole_query_run,
1091 None,
1092 )?;
1093 all_matches.extend(whole_query_followup);
1094
1095 let merged_seq = collect_regular_seq_matches(
1096 &self.index,
1097 &query,
1098 &matched_qspans,
1099 &candidate_contained_matches,
1100 None,
1101 )?;
1102 all_matches.extend(merged_seq);
1103 }
1104
1105 let merged_matches =
1107 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1108
1109 let refined_matches = if unknown_licenses {
1111 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1112 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1113 let filtered_unknown =
1114 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1115
1116 let mut all_matches = good_matches;
1117 all_matches.extend(filtered_unknown);
1118 all_matches.extend(weak_matches);
1119 all_matches
1120 } else {
1121 merged_matches
1122 };
1123
1124 let refined = refine_matches(&self.index, refined_matches, &query);
1126
1127 let mut sorted = refined;
1128 sort_matches_by_line(&mut sorted);
1129
1130 Ok(sorted)
1132 }
1133
1134 pub fn index(&self) -> &index::LicenseIndex {
1136 &self.index
1137 }
1138
1139 pub fn spdx_license_list_version(&self) -> Option<&str> {
1140 self.spdx_license_list_version.as_deref()
1141 }
1142
1143 pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
1144 self.license_index_provenance.as_ref()
1145 }
1146
1147 #[cfg(test)]
1149 pub fn spdx_mapping(&self) -> &SpdxMapping {
1150 &self.spdx_mapping
1151 }
1152}
1153
1154pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1155 for ancestor in search_path.ancestors() {
1156 let candidate = ancestor.join("scancode_config.py");
1157 if candidate.is_file() {
1158 let config = fs::read_to_string(&candidate)?;
1159 return Ok(parse_scancode_spdx_license_list_version(&config));
1160 }
1161 }
1162
1163 Ok(None)
1164}
1165
/// Extracts the value assigned to `spdx_license_list_version` in the text of
/// a `scancode_config.py` file.
///
/// The first line that starts (after trimming) with the exact identifier
/// `spdx_license_list_version` and contains an `=` wins. The text after the
/// first `=` is trimmed and stripped of surrounding double and single quotes.
/// Longer identifiers that merely share the prefix, such as
/// `spdx_license_list_version_url`, are ignored.
///
/// Returns `None` when no such assignment exists.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let rest = line.trim().strip_prefix(KEY)?;

        // Require an identifier boundary after the key so that a different,
        // longer variable name is not mistaken for the version assignment.
        if rest.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        let (_, value) = rest.split_once('=')?;
        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
1179
1180#[cfg(test)]
1181mod tests;