1pub mod aho_match;
7pub mod automaton;
8pub mod build_policy;
9pub mod dataset;
10pub(crate) mod detection;
11pub mod embedded;
12pub mod license_cache;
13mod position_set;
14mod token_multiset;
15mod token_set;
16
17#[cfg(test)]
18mod embedded_test;
19pub mod expression;
20#[cfg(feature = "golden-tests")]
21pub mod golden_utils;
22pub mod hash_match;
23pub mod index;
24mod match_refine;
25pub mod models;
26pub mod query;
27pub mod rules;
28pub mod seq_match;
29pub mod spdx_lid;
30pub mod spdx_mapping;
31#[cfg(test)]
32mod test_utils;
33pub mod tokenize;
34pub mod unknown_match;
35
36use bit_set::BitSet;
37use std::collections::HashSet;
38use std::fs;
39use std::path::Path;
40use std::sync::Arc;
41use std::time::Instant;
42
43use anyhow::Result;
44
45use crate::license_detection::build_policy::EMBEDDED_LICENSE_INDEX_SOURCE;
46use crate::license_detection::dataset::{
47 CUSTOM_LICENSE_DATASET_SOURCE, LoadedLicenseDataset, compute_dataset_fingerprint_string,
48 load_license_dataset_from_root,
49};
50use crate::license_detection::embedded::index::{
51 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
52};
53use crate::license_detection::index::build_index_from_loaded;
54use crate::license_detection::license_cache::{
55 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
56 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
57};
58use crate::license_detection::query::Query;
59use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
60use crate::models::LicenseIndexProvenance;
61use crate::utils::text::strip_utf8_bom_str;
62
63use crate::license_detection::detection::{
64 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
65 split_groups_across_frontmatter_boundary,
66};
67use crate::license_detection::models::MatcherKind;
68
/// Relative path of the ScanCode `rules` data directory inside the reference checkout.
// NOTE(review): `dead_code` is allowed because nothing in this file references the
// constant; presumably it is kept for tooling or tests — confirm before removing.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Relative path of the ScanCode `licenses` data directory inside the reference checkout.
// NOTE(review): see the note on `SCANCODE_LICENSES_RULES_PATH` regarding `dead_code`.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Relative path of the ScanCode license `data` directory (parent of `rules` and `licenses`).
// NOTE(review): see the note on `SCANCODE_LICENSES_RULES_PATH` regarding `dead_code`.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for the ScanCode LicenseDB site.
///
/// The `{}` placeholder is presumably substituted with a license-specific path
/// component by callers — TODO confirm against the call sites.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
/// Errors raised by the detection pipeline itself (as opposed to index loading).
#[derive(Debug, Clone, thiserror::Error)]
pub(crate) enum LicenseDetectionError {
    /// The caller-supplied deadline passed before detection finished; produced by
    /// `ensure_within_deadline`.
    #[error("license detection timed out")]
    Timeout,
}
92
93pub(crate) use detection::{
94 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
95};
96pub use models::LicenseMatch;
97
98pub use aho_match::aho_match;
99pub use hash_match::hash_match;
100pub use match_refine::{
101 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
102 refine_matches_without_false_positive_filter, split_weak_matches,
103};
104pub use position_set::PositionSet;
105pub use spdx_lid::spdx_lid_match;
106pub use token_multiset::TokenMultiset;
107pub use token_set::TokenSet;
108pub use unknown_match::unknown_match;
109
110use self::seq_match::{
111 MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates_with_deadline,
112 seq_match_with_candidates_and_deadline,
113};
114
/// License detection engine: a license index bundled with the SPDX mapping derived
/// from it and the metadata describing where the index came from.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Reference-counted license index handed to every matcher.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is constructed.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version reported by the artifact metadata or dataset manifest,
    /// when one is available.
    spdx_license_list_version: Option<String>,
    /// Provenance record of the index (embedded artifact or custom dataset), when known.
    license_index_provenance: Option<LicenseIndexProvenance>,
}
127
/// Maximum number of bytes of input text fed to detection (10 MiB); longer input is
/// truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to sequence-candidate selection for regular query runs.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Per-child allowance of positions a sequence container may cover beyond its exact
/// children before it stops being considered redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Allowance (plus one) of positions covered by the children but not by the container
/// in the same-expression redundancy check.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
132
/// Report whether `deadline` has been reached.
///
/// Returns `false` when no deadline is set; otherwise compares the current
/// monotonic time against it (a deadline equal to "now" counts as exceeded).
pub(crate) fn deadline_exceeded(deadline: Option<Instant>) -> bool {
    match deadline {
        Some(limit) => Instant::now() >= limit,
        None => false,
    }
}
136
137pub(crate) fn ensure_within_deadline(
138 deadline: Option<Instant>,
139) -> Result<(), LicenseDetectionError> {
140 if deadline_exceeded(deadline) {
141 Err(LicenseDetectionError::Timeout)
142 } else {
143 Ok(())
144 }
145}
146
147fn truncate_detection_text(clean_text: &str) -> &str {
148 if clean_text.len() <= MAX_DETECTION_SIZE {
149 return clean_text;
150 }
151
152 log::debug!(
153 "Content size {} exceeds limit {}, truncating for detection",
154 clean_text.len(),
155 MAX_DETECTION_SIZE
156 );
157
158 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
159 &clean_text[..boundary]
160}
161
162fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
163 (!m.query_span().is_empty()).then(|| m.query_span().clone())
164}
165
166fn has_full_match_coverage(m: &LicenseMatch) -> bool {
167 m.coverage() == 100.0
168}
169
/// Decide whether a sequence match is a redundant "container" around exact matches.
///
/// `container` is considered redundant when it is a high-coverage sequence match and at
/// least two full-coverage Aho matches with the same license expression overlap it and
/// together account for (almost) the same query positions.
///
/// * `container` - the sequence match under scrutiny.
/// * `candidate_contained_matches` - matches that may play the role of exact children.
///
/// Returns `true` when the container should be dropped in favour of its children.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // NOTE(review): the first operand is implied by the second (100 >= 99), so the
    // effective gate is simply `coverage >= 99.0`.
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: full-coverage Aho matches for the same expression that overlap the container.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    // A single child cannot make the container a redundant wrapper.
    if contained.len() < 2 {
        return false;
    }

    // Require at least two children longer than one matched token.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    // Order children by their query bounds so neighbours can be compared pairwise.
    contained.sort_by_key(|m| m.qspan_bounds());

    // Union of every position covered by any child.
    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    // Positions covered by only one side of the comparison.
    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // "Bridge" positions are the gaps between the end bound of one child and the
    // start bound of the next.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children whose bounds overlap each other disqualify the container from removal.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that are NOT inside a gap between children, i.e. that
    // extend past the children's outer edges or sit inside a child's bounds.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Keep the container when its only extra position is a single bridge position
    // and the children add nothing the container lacks.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Keep the container when all of its (at most three) extra positions lie outside
    // the bridges and entirely on one side of the children.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        // The end bound is treated as exclusive here, hence the `- 1`.
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Tolerances scale with the number of children; `contained.len() >= 2` is
    // guaranteed above, so the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    // Redundant only when every difference between container and children is small.
    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
274
275fn filter_redundant_same_expression_seq_containers(
276 seq_matches: Vec<LicenseMatch>,
277 candidate_contained_matches: &[LicenseMatch],
278) -> Vec<LicenseMatch> {
279 seq_matches
280 .into_iter()
281 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
282 .collect()
283}
284
285fn is_redundant_low_coverage_composite_seq_wrapper(
286 container: &LicenseMatch,
287 candidate_contained_matches: &[LicenseMatch],
288) -> bool {
289 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
290 return false;
291 }
292
293 let container_qspan_set = container.qspan_set();
294
295 let children: Vec<&LicenseMatch> = candidate_contained_matches
296 .iter()
297 .filter(|m| {
298 m.matcher == aho_match::MATCH_AHO
299 && has_full_match_coverage(m)
300 && m.license_expression != container.license_expression
301 && m.overlaps_with(&container_qspan_set)
302 })
303 .collect();
304
305 if children.len() < 2 {
306 return false;
307 }
308
309 let unique_expressions: HashSet<&str> = children
310 .iter()
311 .map(|m| m.license_expression.as_str())
312 .collect();
313 if unique_expressions.len() < 2 {
314 return false;
315 }
316
317 let mut child_union = PositionSet::new();
318 for m in &children {
319 child_union.extend_from_span(m.query_span());
320 }
321
322 let container_only_positions = container_qspan_set.difference(&child_union);
323 let child_only_positions = child_union.difference(&container_qspan_set);
324
325 let mut sorted_children = children;
326 sorted_children.sort_by_key(|m| m.qspan_bounds());
327
328 let mut bridge_positions = BitSet::new();
329 for pair in sorted_children.windows(2) {
330 let (_, previous_end) = pair[0].qspan_bounds();
331 let (next_start, _) = pair[1].qspan_bounds();
332 for pos in previous_end..next_start {
333 bridge_positions.insert(pos);
334 }
335 }
336
337 let container_only_boundary_positions = container_only_positions
338 .iter()
339 .filter(|&pos| !bridge_positions.contains(pos))
340 .count();
341
342 child_only_positions.is_empty()
343 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
344 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
345}
346
347fn filter_redundant_low_coverage_composite_seq_wrappers(
348 seq_matches: Vec<LicenseMatch>,
349 candidate_contained_matches: &[LicenseMatch],
350) -> Vec<LicenseMatch> {
351 seq_matches
352 .into_iter()
353 .filter(|m| {
354 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
355 })
356 .collect()
357}
358
359fn subtract_spdx_match_qspans(
360 query: &mut Query<'_>,
361 matched_qspans: &mut Vec<models::PositionSpan>,
362 aho_extra_matchables: &mut PositionSet,
363 spdx_matches: &[LicenseMatch],
364) {
365 for m in spdx_matches {
366 let Some(span) = query_span_for_match(m) else {
367 continue;
368 };
369
370 aho_extra_matchables.extend_from_span(&span);
371 query.subtract(&span);
372
373 if has_full_match_coverage(m) {
374 matched_qspans.push(span);
375 }
376 }
377}
378
379fn merge_and_prepare_aho_matches(
380 index: &index::LicenseIndex,
381 query: &mut Query<'_>,
382 matched_qspans: &mut Vec<models::PositionSpan>,
383 refined_aho: &[LicenseMatch],
384) -> (Vec<LicenseMatch>, bool) {
385 let merged_aho = merge_overlapping_matches(refined_aho);
386 let mut saw_long_exact_license_text_match = false;
387
388 for m in &merged_aho {
389 let Some(span) = query_span_for_match(m) else {
390 continue;
391 };
392
393 if has_full_match_coverage(m) {
394 matched_qspans.push(span.clone());
395 }
396
397 if index.rule(m.rid).is_some_and(|rule| rule.is_license_text())
398 && m.rule_length > 120
399 && m.coverage() > 98.0
400 {
401 query.subtract(&span);
402 saw_long_exact_license_text_match = true;
403 }
404 }
405
406 (merged_aho, saw_long_exact_license_text_match)
407}
408
409fn collect_whole_query_exact_followup_matches(
410 index: &index::LicenseIndex,
411 query: &mut Query<'_>,
412 matched_qspans: &mut Vec<models::PositionSpan>,
413 whole_run: &query::QueryRun<'_>,
414 deadline: Option<Instant>,
415) -> Result<Vec<LicenseMatch>, LicenseDetectionError> {
416 let mut seq_all_matches = Vec::new();
417
418 if whole_run.is_matchable(false, matched_qspans) {
419 let near_dupe_candidates = if deadline.is_some() {
420 select_seq_candidates_with_deadline(
421 index,
422 whole_run,
423 true,
424 MAX_NEAR_DUPE_CANDIDATES,
425 deadline,
426 )?
427 } else {
428 self::seq_match::select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES)
429 };
430
431 if !near_dupe_candidates.is_empty() {
432 let near_dupe_matches = if deadline.is_some() {
433 seq_match_with_candidates_and_deadline(
434 index,
435 whole_run,
436 &near_dupe_candidates,
437 deadline,
438 )?
439 } else {
440 self::seq_match::seq_match_with_candidates(index, whole_run, &near_dupe_candidates)
441 };
442
443 for m in &near_dupe_matches {
444 if !m.query_span().is_empty() {
445 let span = m.query_span().clone();
446 query.subtract(&span);
447 matched_qspans.push(span);
448 }
449 }
450
451 seq_all_matches.extend(near_dupe_matches);
452 }
453 }
454
455 Ok(seq_all_matches)
456}
457
458fn collect_regular_seq_matches(
459 index: &index::LicenseIndex,
460 query: &Query<'_>,
461 matched_qspans: &[models::PositionSpan],
462 candidate_contained_matches: &[LicenseMatch],
463 deadline: Option<Instant>,
464) -> Result<Vec<LicenseMatch>, LicenseDetectionError> {
465 let mut seq_all_matches = Vec::new();
466
467 for (query_run_index, query_run) in query.query_runs().into_iter().enumerate() {
468 if query_run_index % 8 == 0 {
469 ensure_within_deadline(deadline)?;
470 }
471
472 if !query_run.is_matchable(false, matched_qspans) {
473 continue;
474 }
475
476 let candidates = if deadline.is_some() {
477 select_seq_candidates_with_deadline(
478 index,
479 &query_run,
480 false,
481 MAX_REGULAR_SEQ_CANDIDATES,
482 deadline,
483 )?
484 } else {
485 self::seq_match::select_seq_candidates(
486 index,
487 &query_run,
488 false,
489 MAX_REGULAR_SEQ_CANDIDATES,
490 )
491 };
492 if !candidates.is_empty() {
493 let matches = if deadline.is_some() {
494 seq_match_with_candidates_and_deadline(index, &query_run, &candidates, deadline)?
495 } else {
496 self::seq_match::seq_match_with_candidates(index, &query_run, &candidates)
497 };
498 seq_all_matches.extend(matches);
499 }
500 }
501
502 let merged_seq = merge_overlapping_matches(&seq_all_matches);
503 let filtered_same_expression =
504 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
505 Ok(filter_redundant_low_coverage_composite_seq_wrappers(
506 filtered_same_expression,
507 candidate_contained_matches,
508 ))
509}
510
511impl LicenseDetectionEngine {
512 fn from_index(
517 index: index::LicenseIndex,
518 spdx_license_list_version: Option<String>,
519 license_index_provenance: Option<LicenseIndexProvenance>,
520 ) -> Result<Self> {
521 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
522 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
523 let spdx_mapping = build_spdx_mapping(&license_vec);
524
525 Ok(Self {
526 index: Arc::new(index),
527 spdx_mapping,
528 spdx_license_list_version,
529 license_index_provenance,
530 })
531 }
532
533 #[cfg(test)]
534 pub(crate) fn from_test_index(index: index::LicenseIndex) -> Self {
535 Self::from_index(index, None, None).expect("test index should build license engine")
536 }
537
538 pub fn from_embedded() -> Result<Self> {
543 let cache_config =
544 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
545 Self::from_embedded_with_cache(&cache_config)
546 }
547
548 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
563 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
564 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
565 let artifact_metadata = load_embedded_artifact_metadata_from_bytes(artifact_bytes)
566 .map_err(|e| {
567 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
568 })?;
569 debug_assert_eq!(
570 artifact_metadata.license_index_provenance.source,
571 EMBEDDED_LICENSE_INDEX_SOURCE
572 );
573 let spdx_version = Some(artifact_metadata.spdx_license_list_version.clone());
574 let provenance = Some(artifact_metadata.license_index_provenance.clone());
575
576 if !cache_config.reindex {
577 if let Some(cached) =
578 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
579 {
580 let start = Instant::now();
581 eprintln!(
582 "License index loaded from rkyv cache in {:.2}s",
583 start.elapsed().as_secs_f64()
584 );
585 return Self::from_index(cached, spdx_version, provenance);
586 }
587 } else {
588 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
589 }
590
591 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
592 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
593 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
594 let provenance = Some(snapshot.metadata.license_index_provenance.clone());
595
596 let start = Instant::now();
597 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
598 eprintln!(
599 "License index built from embedded artifact in {:.2}s",
600 start.elapsed().as_secs_f64()
601 );
602
603 let mut index = index;
604 index.spdx_license_list_version = spdx_version.clone();
605 if let Err(e) = save_cached_index(
606 cache_config,
607 LicenseCacheNamespace::Embedded,
608 &index,
609 &fingerprint,
610 ) {
611 eprintln!("Warning: failed to save license index cache: {}", e);
612 } else if let Some(size) =
613 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
614 {
615 eprintln!(
616 "License index cache saved ({:.1} MB)",
617 size as f64 / 1_048_576.0
618 );
619 }
620
621 Self::from_index(index, spdx_version, provenance)
622 }
623
624 pub fn from_directory(rules_path: &Path) -> Result<Self> {
629 let cache_config =
630 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
631 Self::from_directory_with_cache(rules_path, &cache_config)
632 }
633
634 pub fn from_directory_with_cache(
646 rules_path: &Path,
647 cache_config: &LicenseCacheConfig,
648 ) -> Result<Self> {
649 let LoadedLicenseDataset {
650 manifest,
651 rules: loaded_rules,
652 licenses: loaded_licenses,
653 } = load_license_dataset_from_root(rules_path)?;
654
655 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses)?;
656 let provenance = Some(LicenseIndexProvenance {
657 source: CUSTOM_LICENSE_DATASET_SOURCE.to_string(),
658 dataset_fingerprint: compute_dataset_fingerprint_string(
659 &loaded_rules,
660 &loaded_licenses,
661 )?,
662 ignored_rules: vec![],
663 ignored_licenses: vec![],
664 ignored_rules_due_to_licenses: vec![],
665 added_rules: vec![],
666 replaced_rules: vec![],
667 added_licenses: vec![],
668 replaced_licenses: vec![],
669 });
670
671 if !cache_config.reindex {
672 if let Some(cached) = load_cached_index(
673 cache_config,
674 LicenseCacheNamespace::CustomRules,
675 &fingerprint,
676 )? {
677 let start = Instant::now();
678 eprintln!(
679 "License index loaded from rkyv cache in {:.2}s",
680 start.elapsed().as_secs_f64()
681 );
682 return Self::from_index(
683 cached,
684 Some(manifest.spdx_license_list_version),
685 provenance,
686 );
687 }
688 } else {
689 delete_cache(
690 cache_config,
691 LicenseCacheNamespace::CustomRules,
692 &fingerprint,
693 )?;
694 }
695
696 let start = Instant::now();
697 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
698 eprintln!(
699 "License index built from custom dataset in {:.2}s",
700 start.elapsed().as_secs_f64()
701 );
702
703 if let Err(e) = save_cached_index(
704 cache_config,
705 LicenseCacheNamespace::CustomRules,
706 &index,
707 &fingerprint,
708 ) {
709 eprintln!("Warning: failed to save license index cache: {}", e);
710 } else if let Some(size) = cache_file_size(
711 cache_config,
712 LicenseCacheNamespace::CustomRules,
713 &fingerprint,
714 ) {
715 eprintln!(
716 "License index cache saved ({:.1} MB)",
717 size as f64 / 1_048_576.0
718 );
719 }
720
721 Self::from_index(index, Some(manifest.spdx_license_list_version), provenance)
722 }
723
724 pub fn embedded_spdx_license_list_version() -> Result<String> {
725 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
726 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
727 .map_err(|e| {
728 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
729 })?
730 .spdx_license_list_version)
731 }
732
733 pub fn detect_with_kind(
734 &self,
735 text: &str,
736 unknown_licenses: bool,
737 binary_derived: bool,
738 ) -> Result<Vec<LicenseDetection>> {
739 self.detect_with_kind_with_score_and_deadline(
740 text,
741 unknown_licenses,
742 binary_derived,
743 0.0,
744 None,
745 )
746 .map_err(Into::into)
747 }
748
749 pub fn detect_with_kind_with_score(
750 &self,
751 text: &str,
752 unknown_licenses: bool,
753 binary_derived: bool,
754 min_score: f32,
755 ) -> Result<Vec<LicenseDetection>> {
756 self.detect_with_kind_with_score_and_deadline(
757 text,
758 unknown_licenses,
759 binary_derived,
760 min_score,
761 None,
762 )
763 .map_err(Into::into)
764 }
765
766 pub(crate) fn detect_with_kind_with_score_and_deadline(
767 &self,
768 text: &str,
769 unknown_licenses: bool,
770 binary_derived: bool,
771 min_score: f32,
772 deadline: Option<Instant>,
773 ) -> Result<Vec<LicenseDetection>, LicenseDetectionError> {
774 ensure_within_deadline(deadline)?;
775 let clean_text = strip_utf8_bom_str(text);
776
777 let content = truncate_detection_text(clean_text);
778
779 ensure_within_deadline(deadline)?;
780 let mut query = if deadline.is_some() {
781 Query::from_extracted_text_with_deadline(
782 content,
783 &self.index,
784 binary_derived,
785 deadline,
786 )?
787 } else {
788 Query::from_extracted_text(content, &self.index, binary_derived)?
789 };
790 let whole_query_run = query.whole_query_run();
791
792 let mut all_matches = Vec::new();
793 let mut candidate_contained_matches = Vec::new();
794 let mut aho_extra_matchables = PositionSet::new();
795 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
796
797 {
800 ensure_within_deadline(deadline)?;
801 let hash_matches = hash_match(&self.index, &whole_query_run);
802
803 if !hash_matches.is_empty() {
804 let mut matches = hash_matches;
805 sort_matches_by_line(&mut matches);
806
807 let groups = split_groups_across_frontmatter_boundary(
808 group_matches_by_region(&matches),
809 Some(content),
810 );
811 let detections: Vec<LicenseDetection> = groups
812 .iter()
813 .map(|group| {
814 let mut detection = empty_detection();
815 populate_detection_from_group_with_spdx(
816 &mut detection,
817 group,
818 &self.spdx_mapping,
819 Some(content),
820 );
821 detection
822 })
823 .collect();
824
825 return Ok(post_process_detections(detections, min_score));
826 }
827 }
828
829 {
831 ensure_within_deadline(deadline)?;
832 let spdx_matches = spdx_lid_match(&self.index, &query);
833 subtract_spdx_match_qspans(
834 &mut query,
835 &mut matched_qspans,
836 &mut aho_extra_matchables,
837 &spdx_matches,
838 );
839 all_matches.extend(spdx_matches);
840 }
841
842 {
844 ensure_within_deadline(deadline)?;
845 let aho_matches = if aho_extra_matchables.is_empty() {
846 if deadline.is_some() {
847 aho_match::aho_match_with_deadline(&self.index, &whole_query_run, deadline)?
848 } else {
849 aho_match(&self.index, &whole_query_run)
850 }
851 } else {
852 if deadline.is_some() {
853 aho_match::aho_match_with_extra_matchables(
854 &self.index,
855 &whole_query_run,
856 Some(&aho_extra_matchables),
857 deadline,
858 )?
859 } else {
860 aho_match::aho_match_with_extra_matchables(
861 &self.index,
862 &whole_query_run,
863 Some(&aho_extra_matchables),
864 None,
865 )?
866 }
867 };
868
869 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
872 candidate_contained_matches.extend(refined_aho.clone());
873 let (merged_aho, _) = merge_and_prepare_aho_matches(
874 &self.index,
875 &mut query,
876 &mut matched_qspans,
877 &refined_aho,
878 );
879 all_matches.extend(merged_aho);
880
881 let whole_query_followup = collect_whole_query_exact_followup_matches(
882 &self.index,
883 &mut query,
884 &mut matched_qspans,
885 &whole_query_run,
886 deadline,
887 )?;
888 all_matches.extend(whole_query_followup);
889
890 let merged_seq = collect_regular_seq_matches(
891 &self.index,
892 &query,
893 &matched_qspans,
894 &candidate_contained_matches,
895 deadline,
896 )?;
897 all_matches.extend(merged_seq);
898 }
899
900 ensure_within_deadline(deadline)?;
903 let merged_matches =
904 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
905
906 let refined_matches = if unknown_licenses {
909 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
911
912 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
914 let filtered_unknown =
915 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
916
917 let mut all_matches = good_matches;
918 all_matches.extend(filtered_unknown);
919 all_matches.extend(weak_matches);
922 all_matches
923 } else {
924 merged_matches
925 };
926
927 ensure_within_deadline(deadline)?;
929 let refined = refine_matches(&self.index, refined_matches, &query);
930
931 let mut sorted = refined;
932 sort_matches_by_line(&mut sorted);
933
934 let groups = split_groups_across_frontmatter_boundary(
935 group_matches_by_region(&sorted),
936 Some(content),
937 );
938
939 let detections: Vec<LicenseDetection> = groups
940 .iter()
941 .map(|group| {
942 let mut detection = empty_detection();
943 populate_detection_from_group_with_spdx(
944 &mut detection,
945 group,
946 &self.spdx_mapping,
947 Some(content),
948 );
949 detection
950 })
951 .collect();
952
953 let detections = post_process_detections(detections, min_score);
954
955 ensure_within_deadline(deadline)?;
956 Ok(detections)
957 }
958
959 pub fn detect_with_kind_and_source(
960 &self,
961 text: &str,
962 unknown_licenses: bool,
963 binary_derived: bool,
964 source_path: &str,
965 ) -> Result<Vec<LicenseDetection>> {
966 self.detect_with_kind_and_source_with_deadline(
967 text,
968 unknown_licenses,
969 binary_derived,
970 source_path,
971 None,
972 )
973 .map_err(Into::into)
974 }
975
976 pub(crate) fn detect_with_kind_and_source_with_deadline(
977 &self,
978 text: &str,
979 unknown_licenses: bool,
980 binary_derived: bool,
981 source_path: &str,
982 deadline: Option<Instant>,
983 ) -> Result<Vec<LicenseDetection>, LicenseDetectionError> {
984 let mut detections = self.detect_with_kind_with_score_and_deadline(
985 text,
986 unknown_licenses,
987 binary_derived,
988 0.0,
989 deadline,
990 )?;
991 attach_source_path_to_detections(&mut detections, source_path);
992 Ok(detections)
993 }
994
995 pub fn detect_with_kind_and_source_with_score(
996 &self,
997 text: &str,
998 unknown_licenses: bool,
999 binary_derived: bool,
1000 source_path: &str,
1001 min_score: f32,
1002 ) -> Result<Vec<LicenseDetection>> {
1003 let mut detections =
1004 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
1005 attach_source_path_to_detections(&mut detections, source_path);
1006 Ok(detections)
1007 }
1008
1009 pub(crate) fn detect_with_kind_and_source_with_score_and_deadline(
1010 &self,
1011 text: &str,
1012 unknown_licenses: bool,
1013 binary_derived: bool,
1014 source_path: &str,
1015 min_score: f32,
1016 deadline: Option<Instant>,
1017 ) -> Result<Vec<LicenseDetection>, LicenseDetectionError> {
1018 let mut detections = self.detect_with_kind_with_score_and_deadline(
1019 text,
1020 unknown_licenses,
1021 binary_derived,
1022 min_score,
1023 deadline,
1024 )?;
1025 attach_source_path_to_detections(&mut detections, source_path);
1026 Ok(detections)
1027 }
1028
    /// Run the matching pipeline on `text` and return the refined matches, sorted by
    /// line, instead of grouped detections.
    ///
    /// Only compiled for tests and the `golden-tests` feature. No deadline is applied:
    /// every deadline-aware helper is called with `None`.
    #[cfg(any(test, feature = "golden-tests"))]
    pub fn detect_matches_with_kind(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
    ) -> Result<Vec<LicenseMatch>> {
        let clean_text = strip_utf8_bom_str(text);

        let content = truncate_detection_text(clean_text);

        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
        let whole_query_run = query.whole_query_run();

        let mut all_matches = Vec::new();
        let mut candidate_contained_matches = Vec::new();
        let mut aho_extra_matchables = PositionSet::new();
        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();

        {
            // Hash matching: a hit on the whole query ends matching immediately.
            let hash_matches = hash_match(&self.index, &whole_query_run);

            if !hash_matches.is_empty() {
                let mut matches = hash_matches;
                sort_matches_by_line(&mut matches);
                return Ok(matches);
            }
        }

        {
            // SPDX identifier matching; matched spans are removed from the query and
            // remembered as extra matchable positions for the Aho phase.
            let spdx_matches = spdx_lid_match(&self.index, &query);
            subtract_spdx_match_qspans(
                &mut query,
                &mut matched_qspans,
                &mut aho_extra_matchables,
                &spdx_matches,
            );
            all_matches.extend(spdx_matches);
        }

        {
            // Aho matching, widened by the SPDX positions when there are any.
            let aho_matches = if aho_extra_matchables.is_empty() {
                aho_match(&self.index, &whole_query_run)
            } else {
                aho_match::aho_match_with_extra_matchables(
                    &self.index,
                    &whole_query_run,
                    Some(&aho_extra_matchables),
                    None,
                )?
            };
            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
            // The refined (unmerged) Aho matches serve as potential children when
            // filtering redundant sequence containers below.
            candidate_contained_matches.extend(refined_aho.clone());
            let (merged_aho, _) = merge_and_prepare_aho_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &refined_aho,
            );
            all_matches.extend(merged_aho);

            // Near-duplicate sequence matching over the whole query run.
            let whole_query_followup = collect_whole_query_exact_followup_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &whole_query_run,
                None,
            )?;
            all_matches.extend(whole_query_followup);

            // Regular sequence matching over the individual query runs.
            let merged_seq = collect_regular_seq_matches(
                &self.index,
                &query,
                &matched_qspans,
                &candidate_contained_matches,
                None,
            )?;
            all_matches.extend(merged_seq);
        }

        let merged_matches =
            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);

        let refined_matches = if unknown_licenses {
            // Unknown-license matching is computed against the good matches only.
            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
            let filtered_unknown =
                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);

            // Order for the final refinement: good, then unknown, then weak.
            let mut all_matches = good_matches;
            all_matches.extend(filtered_unknown);
            all_matches.extend(weak_matches);
            all_matches
        } else {
            merged_matches
        };

        let refined = refine_matches(&self.index, refined_matches, &query);

        let mut sorted = refined;
        sort_matches_by_line(&mut sorted);

        Ok(sorted)
    }
1144
1145 pub fn index(&self) -> &index::LicenseIndex {
1147 &self.index
1148 }
1149
1150 pub fn spdx_license_list_version(&self) -> Option<&str> {
1151 self.spdx_license_list_version.as_deref()
1152 }
1153
1154 pub fn license_index_provenance(&self) -> Option<&LicenseIndexProvenance> {
1155 self.license_index_provenance.as_ref()
1156 }
1157
1158 #[cfg(test)]
1160 pub fn spdx_mapping(&self) -> &SpdxMapping {
1161 &self.spdx_mapping
1162 }
1163}
1164
1165pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
1166 for ancestor in search_path.ancestors() {
1167 let candidate = ancestor.join("scancode_config.py");
1168 if candidate.is_file() {
1169 let config = fs::read_to_string(&candidate)?;
1170 return Ok(parse_scancode_spdx_license_list_version(&config));
1171 }
1172 }
1173
1174 Ok(None)
1175}
1176
/// Extract the value assigned to `spdx_license_list_version` in the text of a
/// ScanCode `scancode_config.py` file.
///
/// The first line whose assignment target is exactly `spdx_license_list_version`
/// (optionally followed by a `: type` annotation) wins. Longer names that merely start
/// with that prefix are ignored, as are `==` comparisons. A quoted value is returned
/// without its quotes and without anything following the closing quote; an unquoted
/// value is returned without a trailing `# comment`.
///
/// Returns `None` when no such assignment is present.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const SETTING: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (target, value) = line.trim().split_once('=')?;

        // Match the whole target name, not just a prefix, so that e.g.
        // `spdx_license_list_version_url = ...` is not mistaken for the setting.
        let name = target.split(':').next().unwrap_or(target).trim();
        if name != SETTING || value.starts_with('=') {
            return None;
        }

        let value = value.trim();
        let unquoted = match value.chars().next() {
            // Quoted literal: keep what lies between the opening quote and its match
            // (or the rest of the line when the quote is never closed).
            Some(quote @ ('"' | '\'')) => {
                let rest = &value[quote.len_utf8()..];
                rest.find(quote).map_or(rest, |end| &rest[..end])
            }
            // Bare value: drop a trailing comment.
            _ => value.split('#').next().unwrap_or(value).trim_end(),
        };

        Some(unquoted.to_string())
    })
}
1190
1191#[cfg(test)]
1192mod tests;