1pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(feature = "golden-tests")]
21pub mod golden_utils;
22pub mod hash_match;
23pub mod index;
24mod match_refine;
25pub mod models;
26pub mod query;
27pub mod rules;
28pub mod seq_match;
29pub mod spdx_lid;
30pub mod spdx_mapping;
31#[cfg(test)]
32mod test_utils;
33pub mod tokenize;
34pub mod unknown_match;
35
36use bit_set::BitSet;
37use std::collections::HashSet;
38use std::fs;
39use std::path::Path;
40use std::sync::Arc;
41use std::time::Instant;
42
43use anyhow::Result;
44
45use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
46use crate::license_detection::dataset::{
47 CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
48 load_license_dataset_from_root,
49};
50use crate::license_detection::embedded::index::{
51 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
52};
53use crate::license_detection::index::build_index_from_loaded;
54use crate::license_detection::license_cache::{
55 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
56 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
57};
58use crate::license_detection::query::Query;
59use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
60use crate::models::LicenseIndexProvenance;
61use crate::utils::text::strip_utf8_bom_str;
62
63use crate::license_detection::detection::{
64 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
65 split_groups_across_frontmatter_boundary,
66};
67use crate::license_detection::models::MatcherKind;
68
/// Relative path of the ScanCode `rules` data directory inside the reference checkout.
// NOTE(review): `dead_code` is allowed here, presumably because the constant is
// only referenced from tests or tooling — confirm before removing the attribute.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Relative path of the ScanCode `licenses` data directory inside the reference checkout.
// NOTE(review): same `dead_code` allowance as above — confirm it is still needed.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Relative path of the ScanCode license data root (parent of `rules` and `licenses`).
// NOTE(review): same `dead_code` allowance as above — confirm it is still needed.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for LicenseDB entries; contains a single `{}` placeholder.
// NOTE(review): the placeholder is presumably filled with a license key — verify at the call site.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Error message produced by `ensure_within_deadline` when the deadline has passed.
pub(crate) const LICENSE_DETECTION_TIMEOUT_MESSAGE: &str = "license detection timed out";
88
89pub(crate) use detection::{
90 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
91};
92pub use models::LicenseMatch;
93
94pub use aho_match::aho_match;
95pub use hash_match::hash_match;
96pub use match_refine::{
97 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
98 refine_matches_without_false_positive_filter, split_weak_matches,
99};
100pub use position_set::PositionSet;
101pub use spdx_lid::spdx_lid_match;
102pub use token_multiset::TokenMultiset;
103pub use token_set::TokenSet;
104pub use unknown_match::unknown_match;
105
106use self::seq_match::{
107 MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
108 seq_match_with_candidates_and_deadline,
109};
110
/// License detection engine: a license index plus the SPDX data derived from it.
///
/// Instances are created through the `from_*` constructors in the `impl` block.
/// The index is stored behind an [`Arc`], so cloning the engine does not copy it.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Shared license index handed to every matcher stage.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses (see `from_index`).
    spdx_mapping: SpdxMapping,
    /// SPDX license list version reported by the dataset the index came from, if any.
    spdx_license_list_version: Option<String>,
    /// Provenance record describing where the index came from, if any.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
123
/// Maximum number of bytes of text fed to detection (10 MiB); longer input is
/// truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to sequence-candidate selection for regular query runs.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Per-child budget of container-only positions used by the redundant
/// sequence-container checks.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base budget of child-only positions tolerated by the same-expression
/// redundant-container check.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
128
/// Reports whether `deadline` has been reached.
///
/// Returns `false` when no deadline is set; otherwise `true` once the current
/// instant is at or past the deadline.
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
132
133pub(crate) fn ensure_within_deadline(deadline: Option<Instant>) -> Result<()> {
134 if deadline_exceeded(deadline) {
135 Err(anyhow::anyhow!(LICENSE_DETECTION_TIMEOUT_MESSAGE))
136 } else {
137 Ok(())
138 }
139}
140
141fn truncate_detection_text(clean_text: &str) -> &str {
142 if clean_text.len() <= MAX_DETECTION_SIZE {
143 return clean_text;
144 }
145
146 log::debug!(
147 "Content size {} exceeds limit {}, truncating for detection",
148 clean_text.len(),
149 MAX_DETECTION_SIZE
150 );
151
152 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
153 &clean_text[..boundary]
154}
155
156fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
157 (!m.query_span().is_empty()).then(|| m.query_span().clone())
158}
159
160fn has_full_match_coverage(m: &LicenseMatch) -> bool {
161 m.coverage() == 100.0
162}
163
/// Decides whether a sequence-matched `container` is redundant because several
/// full-coverage Aho matches with the same license expression already overlap it.
///
/// Returns `false` unless the container is a `MatcherKind::Seq` match with
/// coverage of at least `99.0`, at least two qualifying children overlap it, and
/// at least two of those children have `matched_length > 1`. The remaining
/// checks compare the container's positions with the union of the children's
/// positions and apply the gap budgets derived from the
/// `MAX_REDUNDANT_SEQ_CONTAINER_*` constants.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // The first disjunct is subsumed by the second (100.0 >= 99.0); the effective
    // requirement is coverage >= 99.0.
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: full-coverage Aho matches for the same expression that overlap
    // the container's query positions.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    if contained.len() < 2 {
        return false;
    }

    // Require at least two children longer than a single matched token.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    contained.sort_by_key(|m| m.qspan_bounds());

    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    // Positions covered by only one side of the comparison.
    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // Collect the positions lying between consecutive children.
    // NOTE(review): `qspan_bounds()` appears to return `(start, end)` with an
    // exclusive end, judging by the range below and the `saturating_sub(1)`
    // further down — confirm.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children whose bounds overlap each other disqualify the container.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that are not inside a gap between two children.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // A single extra container position sitting in a gap between children keeps
    // the container.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // When every container-only position lies outside the gaps and there are at
    // most three of them, keep the container if they all sit on one side of the
    // children.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        // NOTE(review): with no container-only positions both `all` calls are
        // vacuously true, so a container exactly covered by its children is kept
        // (returns false) — confirm this is intended.
        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Budgets scale with the number of children; `contained.len() >= 2` here, so
    // the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
268
269fn filter_redundant_same_expression_seq_containers(
270 seq_matches: Vec<LicenseMatch>,
271 candidate_contained_matches: &[LicenseMatch],
272) -> Vec<LicenseMatch> {
273 seq_matches
274 .into_iter()
275 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
276 .collect()
277}
278
279fn is_redundant_low_coverage_composite_seq_wrapper(
280 container: &LicenseMatch,
281 candidate_contained_matches: &[LicenseMatch],
282) -> bool {
283 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
284 return false;
285 }
286
287 let container_qspan_set = container.qspan_set();
288
289 let children: Vec<&LicenseMatch> = candidate_contained_matches
290 .iter()
291 .filter(|m| {
292 m.matcher == aho_match::MATCH_AHO
293 && has_full_match_coverage(m)
294 && m.license_expression != container.license_expression
295 && m.overlaps_with(&container_qspan_set)
296 })
297 .collect();
298
299 if children.len() < 2 {
300 return false;
301 }
302
303 let unique_expressions: HashSet<&str> = children
304 .iter()
305 .map(|m| m.license_expression.as_str())
306 .collect();
307 if unique_expressions.len() < 2 {
308 return false;
309 }
310
311 let mut child_union = PositionSet::new();
312 for m in &children {
313 child_union.extend_from_span(m.query_span());
314 }
315
316 let container_only_positions = container_qspan_set.difference(&child_union);
317 let child_only_positions = child_union.difference(&container_qspan_set);
318
319 let mut sorted_children = children;
320 sorted_children.sort_by_key(|m| m.qspan_bounds());
321
322 let mut bridge_positions = BitSet::new();
323 for pair in sorted_children.windows(2) {
324 let (_, previous_end) = pair[0].qspan_bounds();
325 let (next_start, _) = pair[1].qspan_bounds();
326 for pos in previous_end..next_start {
327 bridge_positions.insert(pos);
328 }
329 }
330
331 let container_only_boundary_positions = container_only_positions
332 .iter()
333 .filter(|&pos| !bridge_positions.contains(pos))
334 .count();
335
336 child_only_positions.is_empty()
337 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
338 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
339}
340
341fn filter_redundant_low_coverage_composite_seq_wrappers(
342 seq_matches: Vec<LicenseMatch>,
343 candidate_contained_matches: &[LicenseMatch],
344) -> Vec<LicenseMatch> {
345 seq_matches
346 .into_iter()
347 .filter(|m| {
348 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
349 })
350 .collect()
351}
352
353fn subtract_spdx_match_qspans(
354 query: &mut Query<'_>,
355 matched_qspans: &mut Vec<models::PositionSpan>,
356 aho_extra_matchables: &mut PositionSet,
357 spdx_matches: &[LicenseMatch],
358) {
359 for m in spdx_matches {
360 let Some(span) = query_span_for_match(m) else {
361 continue;
362 };
363
364 aho_extra_matchables.extend_from_span(&span);
365 query.subtract(&span);
366
367 if has_full_match_coverage(m) {
368 matched_qspans.push(span);
369 }
370 }
371}
372
373fn merge_and_prepare_aho_matches(
374 index: &index::LicenseIndex,
375 query: &mut Query<'_>,
376 matched_qspans: &mut Vec<models::PositionSpan>,
377 refined_aho: &[LicenseMatch],
378) -> (Vec<LicenseMatch>, bool) {
379 let merged_aho = merge_overlapping_matches(refined_aho);
380 let mut saw_long_exact_license_text_match = false;
381
382 for m in &merged_aho {
383 let Some(span) = query_span_for_match(m) else {
384 continue;
385 };
386
387 if has_full_match_coverage(m) {
388 matched_qspans.push(span.clone());
389 }
390
391 if index.rule(m.rid).is_some_and(|rule| rule.is_license_text())
392 && m.rule_length > 120
393 && m.coverage() > 98.0
394 {
395 query.subtract(&span);
396 saw_long_exact_license_text_match = true;
397 }
398 }
399
400 (merged_aho, saw_long_exact_license_text_match)
401}
402
403fn collect_whole_query_exact_followup_matches(
404 index: &index::LicenseIndex,
405 query: &mut Query<'_>,
406 matched_qspans: &mut Vec<models::PositionSpan>,
407 whole_run: &query::QueryRun<'_>,
408 deadline: Option<Instant>,
409) -> Result<Vec<LicenseMatch>> {
410 let mut seq_all_matches = Vec::new();
411
412 if whole_run.is_matchable(false, matched_qspans) {
413 let near_dupe_candidates = if deadline.is_some() {
414 select_seq_candidates_with_deadline(
415 index,
416 whole_run,
417 true,
418 MAX_NEAR_DUPE_CANDIDATES,
419 deadline,
420 )?
421 } else {
422 self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
423 };
424
425 if !near_dupe_candidates.is_empty() {
426 let near_dupe_matches = if deadline.is_some() {
427 seq_match_with_candidates_and_deadline(
428 index,
429 whole_run,
430 &near_dupe_candidates,
431 deadline,
432 )?
433 } else {
434 self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
435 };
436
437 for m in &near_dupe_matches {
438 if !m.query_span().is_empty() {
439 let span = m.query_span().clone();
440 query.subtract(&span);
441 matched_qspans.push(span);
442 }
443 }
444
445 seq_all_matches.extend(near_dupe_matches);
446 }
447 }
448
449 Ok(seq_all_matches)
450}
451
452fn collect_regular_seq_matches(
453 index: &index::LicenseIndex,
454 query: &Query<'_>,
455 matched_qspans: &[models::PositionSpan],
456 candidate_contained_matches: &[LicenseMatch],
457 deadline: Option<Instant>,
458) -> Result<Vec<LicenseMatch>> {
459 let mut seq_all_matches = Vec::new();
460
461 for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
462 if query_run_index % 8 == 0 {
463 ensure_within_deadline(deadline)?;
464 }
465
466 if !query_run.is_matchable(false, matched_qspans) {
467 continue;
468 }
469
470 let candidates = if deadline.is_some() {
471 select_seq_candidates_with_deadline(
472 index,
473 &query_run,
474 false,
475 MAX_REGULAR_SEQ_CANDIDATES,
476 deadline,
477 )?
478 } else {
479 self::seq_match::select_seq_candidates(
480 index,
481 &query_run,
482 false,
483 MAX_REGULAR_SEQ_CANDIDATES,
484 )
485 };
486 if !candidates.is_empty() {
487 let matches = if deadline.is_some() {
488 seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
489 } else {
490 self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
491 };
492 seq_all_matches.extend(matches);
493 }
494 }
495
496 let merged_seq = merge_overlapping_matches(&seq_all_matches);
497 let filtered_same_expression =
498 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
499 Ok(filter_redundant_low_coverage_composite_seq_wrappers(
500 filtered_same_expression,
501 candidate_contained_matches,
502 ))
503}
504
505impl LicenseDetectionEngine {
506 fn from_index(
511 index: index::LicenseIndex,
512 spdx_license_list_version: Option<String>,
513 license_index_provenance: Option<LicenseIndexProvenance>,
514 ) -> Result<Self> {
515 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
516 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
517 let spdx_mapping = build_spdx_mapping(&license_vec);
518
519 Ok(Self {
520 index: Arc::new(index),
521 spdx_mapping,
522 spdx_license_list_version,
523 license_index_provenance,
524 })
525 }
526
527 #[cfg(test)]
528 pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
529 Self::from_index(index, None, None).expect("test index should build license engine")
530 }
531
532 pub fn from_embedded() -> Result<Self> {
537 let cache_config =
538 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
539 Self::from_embedded_with_cache(&cache_config)
540 }
541
542 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
557 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
558 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
559 let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
560 .map_err(|e| {
561 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
562 })?;
563 debug_assert_eq!(
564 artifact_metadata.license_index_provenance.source,
565 EMBEDDED_LICENSE_INDEX_SOURCE
566 );
567 let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
568 let provenance = Some(artifact_metadata.license_index_provenance.clone());
569
570 if !cache_config.reindex {
571 if let Some(cached) =
572 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
573 {
574 let start = Instant::now();
575 eprintln!(
576 "License index loaded from rkyv cache in {:.2}s",
577 start.elapsed().as_secs_f64()
578 );
579 return Self::from_index(cached, spdx_version, provenance);
580 }
581 } else {
582 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
583 }
584
585 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
586 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
587 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
588 let provenance = Some(snapshot.metadata.license_index_provenance.clone());
589
590 let start = Instant::now();
591 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
592 eprintln!(
593 "License index built from embedded artifact in {:.2}s",
594 start.elapsed().as_secs_f64()
595 );
596
597 let mut index = index;
598 index.spdx_license_list_version = spdx_version.clone();
599 if let Err(e) = save_cached_index(
600 cache_config,
601 LicenseCacheNamespace::Embedded,
602 &index,
603 &fingerprint,
604 ) {
605 eprintln!("Warning: failed to save license index cache: {}", e);
606 } else if let Some(size) =
607 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
608 {
609 eprintln!(
610 "License index cache saved ({:.1} MB)",
611 size as f64 / 1_048_576.0
612 );
613 }
614
615 Self::from_index(index, spdx_version, provenance)
616 }
617
618 pub fn from_directory(rules_path: &Path) -> Result<Self> {
623 let cache_config =
624 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
625 Self::from_directory_with_cache(rules_path, &cache_config)
626 }
627
628 pub fn from_directory_with_cache(
640 rules_path: &Path,
641 cache_config: &LicenseCacheConfig,
642 ) -> Result<Self> {
643 let LoadedLicenseDataset {
644 manifest,
645 rules: loaded_rules,
646 licenses: loaded_licenses,
647 } = load_license_dataset_from_root(rules_path)?;
648
649 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
650 let provenance = Some(LicenseIndexProvenance {
651 source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
652 dataset_fingerprint: compute_dataset_fingerprint_string(
653 &loaded_rules,
654 &loaded_licenses,
655 )?,
656 ignored_rules: vec![],
657 ignored_licenses: vec![],
658 ignored_rules_due_to_licenses: vec![],
659 added_rules: vec![],
660 replaced_rules: vec![],
661 added_licenses: vec![],
662 replaced_licenses: vec![],
663 });
664
665 if !cache_config.reindex {
666 if let Some(cached) = load_cached_index(
667 cache_config,
668 LicenseCacheNamespace::CustomRules,
669 &fingerprint,
670 )? {
671 let start = Instant::now();
672 eprintln!(
673 "License index loaded from rkyv cache in {:.2}s",
674 start.elapsed().as_secs_f64()
675 );
676 return Self::from_index(
677 cached,
678 Some(manifest.spdx_license_list_version),
679 provenance,
680 );
681 }
682 } else {
683 delete_cache(
684 cache_config,
685 LicenseCacheNamespace::CustomRules,
686 &fingerprint,
687 )?;
688 }
689
690 let start = Instant::now();
691 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
692 eprintln!(
693 "License index built from custom dataset in {:.2}s",
694 start.elapsed().as_secs_f64()
695 );
696
697 if let Err(e) = save_cached_index(
698 cache_config,
699 LicenseCacheNamespace::CustomRules,
700 &index,
701 &fingerprint,
702 ) {
703 eprintln!("Warning: failed to save license index cache: {}", e);
704 } else if let Some(size) = cache_file_size(
705 cache_config,
706 LicenseCacheNamespace::CustomRules,
707 &fingerprint,
708 ) {
709 eprintln!(
710 "License index cache saved ({:.1} MB)",
711 size as f64 / 1_048_576.0
712 );
713 }
714
715 Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
716 }
717
718 pub fn embedded_spdx_license_list_version() -> Result<String> {
719 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
720 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
721 .map_err(|e| {
722 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
723 })?
724 .spdx_license_list_version)
725 }
726
727 pub fn detect_with_kind(
728 &self,
729 text: &str,
730 unknown_licenses: bool,
731 binary_derived: bool,
732 ) -> Result<Vec<LicenseDetection>> {
733 self.detect_with_kind_with_score_and_deadline(
734 text,
735 unknown_licenses,
736 binary_derived,
737 0.0,
738 None,
739 )
740 }
741
742 pub fn detect_with_kind_with_score(
743 &self,
744 text: &str,
745 unknown_licenses: bool,
746 binary_derived: bool,
747 min_score: f32,
748 ) -> Result<Vec<LicenseDetection>> {
749 self.detect_with_kind_with_score_and_deadline(
750 text,
751 unknown_licenses,
752 binary_derived,
753 min_score,
754 None,
755 )
756 }
757
758 pub(crate) fn detect_with_kind_with_score_and_deadline(
759 &self,
760 text: &str,
761 unknown_licenses: bool,
762 binary_derived: bool,
763 min_score: f32,
764 deadline: Option<Instant>,
765 ) -> Result<Vec<LicenseDetection>> {
766 ensure_within_deadline(deadline)?;
767 let clean_text = strip_utf8_bom_str(text);
768
769 let content = truncate_detection_text(clean_text);
770
771 ensure_within_deadline(deadline)?;
772 let mut query = if deadline.is_some() {
773 Query::from_extracted_text_with_deadline(
774 content,
775 &self.index,
776 binary_derived,
777 deadline,
778 )?
779 } else {
780 Query::from_extracted_text(content, &self.index, binary_derived)?
781 };
782 let whole_query_run = query.whole_query_run();
783
784 let mut all_matches = Vec::new();
785 let mut candidate_contained_matches = Vec::new();
786 let mut aho_extra_matchables = PositionSet::new();
787 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
788
789 {
792 ensure_within_deadline(deadline)?;
793 let hash_matches = hash_match(&self.index, &whole_query_run);
794
795 if !hash_matches.is_empty() {
796 let mut matches = hash_matches;
797 sort_matches_by_line(&mut matches);
798
799 let groups = split_groups_across_frontmatter_boundary(
800 group_matches_by_region(&matches),
801 Some(content),
802 );
803 let detections: Vec<LicenseDetection> = groups
804 .iter()
805 .map(|group| {
806 let mut detection = empty_detection();
807 populate_detection_from_group_with_spdx(
808 &mut detection,
809 group,
810 &self.spdx_mapping,
811 Some(content),
812 );
813 detection
814 })
815 .collect();
816
817 return Ok(post_process_detections(detections, min_score));
818 }
819 }
820
821 {
823 ensure_within_deadline(deadline)?;
824 let spdx_matches = spdx_lid_match(&self.index, &query);
825 subtract_spdx_match_qspans(
826 &mut query,
827 &mut matched_qspans,
828 &mut aho_extra_matchables,
829 &spdx_matches,
830 );
831 all_matches.extend(spdx_matches);
832 }
833
834 {
836 ensure_within_deadline(deadline)?;
837 let aho_matches = if aho_extra_matchables.is_empty() {
838 if deadline.is_some() {
839 aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
840 } else {
841 aho_match(&self.index, &whole_query_run)
842 }
843 } else {
844 if deadline.is_some() {
845 aho_match::aho_match_with_extra_matchables(
846 &self.index,
847 &whole_query_run,
848 Some(&aho_extra_matchables),
849 deadline,
850 )?
851 } else {
852 aho_match::aho_match_with_extra_matchables(
853 &self.index,
854 &whole_query_run,
855 Some(&aho_extra_matchables),
856 None,
857 )?
858 }
859 };
860
861 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
864 candidate_contained_matches.extend(refined_aho.clone());
865 let (merged_aho, _) = merge_and_prepare_aho_matches(
866 &self.index,
867 &mut query,
868 &mut matched_qspans,
869 &refined_aho,
870 );
871 all_matches.extend(merged_aho);
872
873 let whole_query_followup = collect_whole_query_exact_followup_matches(
874 &self.index,
875 &mut query,
876 &mut matched_qspans,
877 &whole_query_run,
878 deadline,
879 )?;
880 all_matches.extend(whole_query_followup);
881
882 let merged_seq = collect_regular_seq_matches(
883 &self.index,
884 &query,
885 &matched_qspans,
886 &candidate_contained_matches,
887 deadline,
888 )?;
889 all_matches.extend(merged_seq);
890 }
891
892 ensure_within_deadline(deadline)?;
895 let merged_matches =
896 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
897
898 let refined_matches = if unknown_licenses {
901 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
903
904 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
906 let filtered_unknown =
907 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
908
909 let mut all_matches = good_matches;
910 all_matches.extend(filtered_unknown);
911 all_matches.extend(weak_matches);
914 all_matches
915 } else {
916 merged_matches
917 };
918
919 ensure_within_deadline(deadline)?;
921 let refined = refine_matches(&self.index, refined_matches, &query);
922
923 let mut sorted = refined;
924 sort_matches_by_line(&mut sorted);
925
926 let groups = split_groups_across_frontmatter_boundary(
927 group_matches_by_region(&sorted),
928 Some(content),
929 );
930
931 let detections: Vec<LicenseDetection> = groups
932 .iter()
933 .map(|group| {
934 let mut detection = empty_detection();
935 populate_detection_from_group_with_spdx(
936 &mut detection,
937 group,
938 &self.spdx_mapping,
939 Some(content),
940 );
941 detection
942 })
943 .collect();
944
945 let detections = post_process_detections(detections, min_score);
946
947 ensure_within_deadline(deadline)?;
948 Ok(detections)
949 }
950
951 pub fn detect_with_kind_and_source(
952 &self,
953 text: &str,
954 unknown_licenses: bool,
955 binary_derived: bool,
956 source_path: &str,
957 ) -> Result<Vec<LicenseDetection>> {
958 self.detect_with_kind_and_source_with_deadline(
959 text,
960 unknown_licenses,
961 binary_derived,
962 source_path,
963 None,
964 )
965 }
966
967 pub(crate) fn detect_with_kind_and_source_with_deadline(
968 &self,
969 text: &str,
970 unknown_licenses: bool,
971 binary_derived: bool,
972 source_path: &str,
973 deadline: Option<Instant>,
974 ) -> Result<Vec<LicenseDetection>> {
975 let mut detections = self.detect_with_kind_with_score_and_deadline(
976 text,
977 unknown_licenses,
978 binary_derived,
979 0.0,
980 deadline,
981 )?;
982 attach_source_path_to_detections(&mut detections, source_path);
983 Ok(detections)
984 }
985
986 pub fn detect_with_kind_and_source_with_score(
987 &self,
988 text: &str,
989 unknown_licenses: bool,
990 binary_derived: bool,
991 source_path: &str,
992 min_score: f32,
993 ) -> Result<Vec<LicenseDetection>> {
994 let mut detections =
995 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
996 attach_source_path_to_detections(&mut detections, source_path);
997 Ok(detections)
998 }
999
1000 pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
1001 &self,
1002 text: &str,
1003 unknown_licenses: bool,
1004 binary_derived: bool,
1005 source_path: &str,
1006 min_score: f32,
1007 deadline: Option<Instant>,
1008 ) -> Result<Vec<LicenseDetection>> {
1009 let mut detections = self.detect_with_kind_with_score_and_deadline(
1010 text,
1011 unknown_licenses,
1012 binary_derived,
1013 min_score,
1014 deadline,
1015 )?;
1016 attach_source_path_to_detections(&mut detections, source_path);
1017 Ok(detections)
1018 }
1019
1020 #[cfg(any(test, feature = "golden-tests"))]
1025 pub fn detect_matches_with_kind(
1026 &self,
1027 text: &str,
1028 unknown_licenses: bool,
1029 binary_derived: bool,
1030 ) -> Result<Vec<LicenseMatch>> {
1031 let clean_text = strip_utf8_bom_str(text);
1032
1033 let content = truncate_detection_text(clean_text);
1034
1035 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
1036 let whole_query_run = query.whole_query_run();
1037
1038 let mut all_matches = Vec::new();
1039 let mut candidate_contained_matches = Vec::new();
1040 let mut aho_extra_matchables = PositionSet::new();
1041 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
1042
1043 {
1045 let hash_matches = hash_match(&self.index, &whole_query_run);
1046
1047 if !hash_matches.is_empty() {
1048 let mut matches = hash_matches;
1049 sort_matches_by_line(&mut matches);
1050 return Ok(matches);
1051 }
1052 }
1053
1054 {
1056 let spdx_matches = spdx_lid_match(&self.index, &query);
1057 subtract_spdx_match_qspans(
1058 &mut query,
1059 &mut matched_qspans,
1060 &mut aho_extra_matchables,
1061 &spdx_matches,
1062 );
1063 all_matches.extend(spdx_matches);
1064 }
1065
1066 {
1068 let aho_matches = if aho_extra_matchables.is_empty() {
1069 aho_match(&self.index, &whole_query_run)
1070 } else {
1071 aho_match::aho_match_with_extra_matchables(
1072 &self.index,
1073 &whole_query_run,
1074 Some(&aho_extra_matchables),
1075 None,
1076 )?
1077 };
1078 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
1079 candidate_contained_matches.extend(refined_aho.clone());
1080 let (merged_aho, _) = merge_and_prepare_aho_matches(
1081 &self.index,
1082 &mut query,
1083 &mut matched_qspans,
1084 &refined_aho,
1085 );
1086 all_matches.extend(merged_aho);
1087
1088 let whole_query_followup = collect_whole_query_exact_followup_matches(
1089 &self.index,
1090 &mut query,
1091 &mut matched_qspans,
1092 &whole_query_run,
1093 None,
1094 )?;
1095 all_matches.extend(whole_query_followup);
1096
1097 let merged_seq = collect_regular_seq_matches(
1098 &self.index,
1099 &query,
1100 &matched_qspans,
1101 &candidate_contained_matches,
1102 None,
1103 )?;
1104 all_matches.extend(merged_seq);
1105 }
1106
1107 let merged_matches =
1109 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
1110
1111 let refined_matches = if unknown_licenses {
1113 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
1114 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
1115 let filtered_unknown =
1116 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
1117
1118 let mut all_matches = good_matches;
1119 all_matches.extend(filtered_unknown);
1120 all_matches.extend(weak_matches);
1121 all_matches
1122 } else {
1123 merged_matches
1124 };
1125
1126 let refined = refine_matches(&self.index, refined_matches, &query);
1128
1129 let mut sorted = refined;
1130 sort_matches_by_line(&mut sorted);
1131
1132 Ok(sorted)
1134 }
1135
    /// Returns a reference to the license index held by this engine.
    pub fn index(&self) -> &index::LicenseIndex {
        &self.index
    }
1140
    /// Returns the stored SPDX license list version, if one was recorded.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }
1144
    /// Returns the stored license index provenance, if one was recorded.
    pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
        self.license_index_provenance.as_ref()
    }
1148
    /// Test-only accessor for the SPDX mapping built in `from_index`.
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
1154}
1155
1156pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1157 for ancestor in search_path.ancestors() {
1158 let candidate = ancestor.join("scancode_config.py");
1159 if candidate.is_file() {
1160 let config = fs::read_to_string(&candidate)?;
1161 return Ok(parse_scancode_spdx_license_list_version(&config));
1162 }
1163 }
1164
1165 Ok(None)
1166}
1167
/// Extracts the value assigned to `spdx_license_list_version` in the text of a
/// `scancode_config.py` file.
///
/// The first line whose trimmed text starts with exactly that identifier and
/// contains an `=` wins. Longer identifiers that merely share the prefix (for
/// example `spdx_license_list_version_url`) are skipped. A trailing `#` comment
/// outside of quotes is dropped, then surrounding whitespace and quote
/// characters are stripped from the value.
///
/// Returns `None` when no such assignment exists.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    /// Cuts `value` at the first `#` that is not inside a quoted string.
    fn strip_trailing_comment(value: &str) -> &str {
        let mut open_quote: Option<char> = None;
        for (idx, ch) in value.char_indices() {
            match open_quote {
                Some(quote) if ch == quote => open_quote = None,
                Some(_) => {}
                None if ch == '"' || ch == '\'' => open_quote = Some(ch),
                None if ch == '#' => return &value[..idx],
                None => {}
            }
        }
        value
    }

    config.lines().find_map(|line| {
        let rest = line.trim().strip_prefix(KEY)?;

        // An identifier character right after the key means this is a different,
        // longer name that only shares the prefix.
        if rest.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        let (_, value) = rest.split_once('=')?;
        Some(
            strip_trailing_comment(value)
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
1181
1182#[cfg(test)]
1183mod tests;