1pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7pub mod license_cache;
8mod position_set;
9mod token_multiset;
10mod token_set;
11
12#[cfg(test)]
13mod embedded_test;
14pub mod expression;
15#[cfg(all(test, feature = "golden-tests"))]
16mod golden_test;
17#[cfg(feature = "golden-tests")]
18pub mod golden_utils;
19pub mod hash_match;
20pub mod index;
21mod match_refine;
22pub mod models;
23pub mod query;
24pub mod rules;
25pub mod seq_match;
26pub mod spdx_lid;
27pub mod spdx_mapping;
28#[cfg(test)]
29mod test_utils;
30pub mod tokenize;
31pub mod unknown_match;
32
33use bit_set::BitSet;
34use std::collections::HashSet;
35use std::fs;
36use std::path::Path;
37use std::sync::Arc;
38use std::time::Instant;
39
40use anyhow::Result;
41
42use crate::license_detection::embedded::index::{
43 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
44};
45use crate::license_detection::index::CachedLicenseIndex;
46use crate::license_detection::index::build_index_from_loaded;
47use crate::license_detection::license_cache::{
48 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
49 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
50};
51use crate::license_detection::query::Query;
52use crate::license_detection::rules::{
53 load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
54};
55use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
56use crate::utils::text::strip_utf8_bom_str;
57
58use crate::license_detection::detection::{
59 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
60};
61use crate::license_detection::models::MatcherKind;
62
/// Path string of the `rules` data directory inside the ScanCode Toolkit tree
/// under `reference/`.
///
/// NOTE(review): not referenced in this file; `dead_code` is presumably allowed
/// because the users live in tests or tooling — confirm.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Path string of the `licenses` data directory inside the ScanCode Toolkit
/// tree under `reference/`.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Path string of the `data` directory that is the parent of both the `rules`
/// and `licenses` paths above.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for the ScanCode LicenseDB site, with one `{}` placeholder.
///
/// NOTE(review): the placeholder is presumably filled with a license key or
/// file name by a caller outside this file — confirm.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
81
82pub(crate) use detection::{
83 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
84};
85pub use models::LicenseMatch;
86
87pub use aho_match::aho_match;
88pub use hash_match::hash_match;
89pub use match_refine::{
90 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
91 refine_matches_without_false_positive_filter, split_weak_matches,
92};
93pub use position_set::PositionSet;
94pub use spdx_lid::spdx_lid_match;
95pub use token_multiset::TokenMultiset;
96pub use token_set::TokenSet;
97pub use unknown_match::unknown_match;
98
99use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
100
/// License detection engine: a license index plus the SPDX mapping derived
/// from the licenses stored in that index.
///
/// The index is held behind an [`Arc`], so cloning the engine shares the index
/// rather than copying it.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Shared license index handed to every matcher.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is created.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version associated with the loaded data, if one was found.
    spdx_license_list_version: Option<String>,
}
112
/// Largest input, in bytes, handed to detection; `truncate_detection_text`
/// cuts longer content down to this size.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;

/// Candidate limit passed to `select_seq_candidates` when matching individual
/// query runs in `collect_regular_seq_matches`.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;

/// Base allowance for container-only query positions when deciding whether a
/// sequence match merely wraps exact matches.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;

/// Base allowance for positions covered by the exact children but not by the
/// sequence container.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
117
118fn truncate_detection_text(clean_text: &str) -> &str {
119 if clean_text.len() <= MAX_DETECTION_SIZE {
120 return clean_text;
121 }
122
123 log::debug!(
124 "Content size {} exceeds limit {}, truncating for detection",
125 clean_text.len(),
126 MAX_DETECTION_SIZE
127 );
128
129 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
130 &clean_text[..boundary]
131}
132
133fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
134 (!m.query_span().is_empty()).then(|| m.query_span().clone())
135}
136
137fn has_full_match_coverage(m: &LicenseMatch) -> bool {
138 m.coverage() == 100.0
139}
140
/// Decides whether a sequence match (`container`) is redundant because several
/// fully covered Aho matches with the same license expression already cover
/// roughly the same query positions.
///
/// Returns `false` unless `container` is a `Seq` match with coverage of at
/// least 99 and at least two qualifying children exist; the remaining checks
/// compare the positions covered only by the container with those covered only
/// by the children.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // NOTE(review): the full-coverage test (== 100.0) is already implied by the
    // `>= 99.0` comparison; both are kept as written.
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: fully covered Aho matches with the container's license
    // expression that overlap the container's query positions.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    if contained.len() < 2 {
        return false;
    }

    // At least two children must have matched more than a single position.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    contained.sort_by_key(|m| m.qspan_bounds());

    // Union of every child's query span.
    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // Positions lying between consecutive children ("bridges").
    // NOTE(review): the range `previous_end..next_start` suggests the end bound
    // from `qspan_bounds` is exclusive — confirm.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Neighbouring children whose bounds overlap: do not treat the
        // container as redundant.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that are not inside any bridge.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Keep the container when its single extra position sits in a bridge and
    // the children add nothing the container lacks.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Keep the container when all of its (at most three) extra positions lie
    // outside the bridges and entirely before the first child or entirely
    // after the last one.
    // NOTE(review): when `container_only_positions` is empty both `all(..)`
    // calls are vacuously true, so a container whose positions equal the child
    // union is reported as NOT redundant — confirm this is intended.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Final thresholds scale with the number of children; `contained.len()` is
    // at least 2 here, so the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
245
246fn filter_redundant_same_expression_seq_containers(
247 seq_matches: Vec<LicenseMatch>,
248 candidate_contained_matches: &[LicenseMatch],
249) -> Vec<LicenseMatch> {
250 seq_matches
251 .into_iter()
252 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
253 .collect()
254}
255
256fn is_redundant_low_coverage_composite_seq_wrapper(
257 container: &LicenseMatch,
258 candidate_contained_matches: &[LicenseMatch],
259) -> bool {
260 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
261 return false;
262 }
263
264 let container_qspan_set = container.qspan_set();
265
266 let children: Vec<&LicenseMatch> = candidate_contained_matches
267 .iter()
268 .filter(|m| {
269 m.matcher == aho_match::MATCH_AHO
270 && has_full_match_coverage(m)
271 && m.license_expression != container.license_expression
272 && m.overlaps_with(&container_qspan_set)
273 })
274 .collect();
275
276 if children.len() < 2 {
277 return false;
278 }
279
280 let unique_expressions: HashSet<&str> = children
281 .iter()
282 .map(|m| m.license_expression.as_str())
283 .collect();
284 if unique_expressions.len() < 2 {
285 return false;
286 }
287
288 let mut child_union = PositionSet::new();
289 for m in &children {
290 child_union.extend_from_span(m.query_span());
291 }
292
293 let container_only_positions = container_qspan_set.difference(&child_union);
294 let child_only_positions = child_union.difference(&container_qspan_set);
295
296 let mut sorted_children = children;
297 sorted_children.sort_by_key(|m| m.qspan_bounds());
298
299 let mut bridge_positions = BitSet::new();
300 for pair in sorted_children.windows(2) {
301 let (_, previous_end) = pair[0].qspan_bounds();
302 let (next_start, _) = pair[1].qspan_bounds();
303 for pos in previous_end..next_start {
304 bridge_positions.insert(pos);
305 }
306 }
307
308 let container_only_boundary_positions = container_only_positions
309 .iter()
310 .filter(|&pos| !bridge_positions.contains(pos))
311 .count();
312
313 child_only_positions.is_empty()
314 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
315 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
316}
317
318fn filter_redundant_low_coverage_composite_seq_wrappers(
319 seq_matches: Vec<LicenseMatch>,
320 candidate_contained_matches: &[LicenseMatch],
321) -> Vec<LicenseMatch> {
322 seq_matches
323 .into_iter()
324 .filter(|m| {
325 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
326 })
327 .collect()
328}
329
330fn subtract_spdx_match_qspans(
331 query: &mut Query<'_>,
332 matched_qspans: &mut Vec<models::PositionSpan>,
333 aho_extra_matchables: &mut PositionSet,
334 spdx_matches: &[LicenseMatch],
335) {
336 for m in spdx_matches {
337 let Some(span) = query_span_for_match(m) else {
338 continue;
339 };
340
341 aho_extra_matchables.extend_from_span(&span);
342 query.subtract(&span);
343
344 if has_full_match_coverage(m) {
345 matched_qspans.push(span);
346 }
347 }
348}
349
350fn merge_and_prepare_aho_matches(
351 index: &index::LicenseIndex,
352 query: &mut Query<'_>,
353 matched_qspans: &mut Vec<models::PositionSpan>,
354 refined_aho: &[LicenseMatch],
355) -> (Vec<LicenseMatch>, bool) {
356 let merged_aho = merge_overlapping_matches(refined_aho);
357 let mut saw_long_exact_license_text_match = false;
358
359 for m in &merged_aho {
360 let Some(span) = query_span_for_match(m) else {
361 continue;
362 };
363
364 if has_full_match_coverage(m) {
365 matched_qspans.push(span.clone());
366 }
367
368 if index
369 .rules_by_rid
370 .get(m.rid)
371 .is_some_and(|rule| rule.is_license_text())
372 && m.rule_length > 120
373 && m.coverage() > 98.0
374 {
375 query.subtract(&span);
376 saw_long_exact_license_text_match = true;
377 }
378 }
379
380 (merged_aho, saw_long_exact_license_text_match)
381}
382
383fn collect_whole_query_exact_followup_matches(
384 index: &index::LicenseIndex,
385 query: &mut Query<'_>,
386 matched_qspans: &mut Vec<models::PositionSpan>,
387 whole_run: &query::QueryRun<'_>,
388) -> Vec<LicenseMatch> {
389 let mut seq_all_matches = Vec::new();
390
391 if whole_run.is_matchable(false, matched_qspans) {
392 let near_dupe_candidates =
393 select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
394
395 if !near_dupe_candidates.is_empty() {
396 let near_dupe_matches =
397 seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
398
399 for m in &near_dupe_matches {
400 if !m.query_span().is_empty() {
401 let span = m.query_span().clone();
402 query.subtract(&span);
403 matched_qspans.push(span);
404 }
405 }
406
407 seq_all_matches.extend(near_dupe_matches);
408 }
409 }
410
411 seq_all_matches
412}
413
414fn collect_regular_seq_matches(
415 index: &index::LicenseIndex,
416 query: &Query<'_>,
417 matched_qspans: &[models::PositionSpan],
418 candidate_contained_matches: &[LicenseMatch],
419) -> Vec<LicenseMatch> {
420 let mut seq_all_matches = Vec::new();
421
422 for query_run in query.query_runs() {
423 if !query_run.is_matchable(false, matched_qspans) {
424 continue;
425 }
426
427 let candidates =
428 select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
429 if !candidates.is_empty() {
430 let matches = seq_match_with_candidates(index, &query_run, &candidates);
431 seq_all_matches.extend(matches);
432 }
433 }
434
435 let merged_seq = merge_overlapping_matches(&seq_all_matches);
436 let filtered_same_expression =
437 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
438 filter_redundant_low_coverage_composite_seq_wrappers(
439 filtered_same_expression,
440 candidate_contained_matches,
441 )
442}
443
444impl LicenseDetectionEngine {
445 fn from_index(
450 index: index::LicenseIndex,
451 spdx_license_list_version: Option<String>,
452 ) -> Result<Self> {
453 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
454 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
455 let spdx_mapping = build_spdx_mapping(&license_vec);
456
457 Ok(Self {
458 index: Arc::new(index),
459 spdx_mapping,
460 spdx_license_list_version,
461 })
462 }
463
464 pub fn from_embedded() -> Result<Self> {
469 let cache_config =
470 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
471 Self::from_embedded_with_cache(&cache_config)
472 }
473
474 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
489 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
490 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
491
492 if !cache_config.reindex {
493 if let Some(cached) =
494 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
495 {
496 let start = Instant::now();
497 let spdx_version = cached.spdx_license_list_version.clone();
498 let index = index::LicenseIndex::from(cached);
499 eprintln!(
500 "License index loaded from rkyv cache in {:.2}s",
501 start.elapsed().as_secs_f64()
502 );
503 return Self::from_index(index, spdx_version);
504 }
505 } else {
506 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
507 }
508
509 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
510 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
511 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
512
513 let start = Instant::now();
514 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
515 eprintln!(
516 "License index built from embedded artifact in {:.2}s",
517 start.elapsed().as_secs_f64()
518 );
519
520 let mut cached = CachedLicenseIndex::from(index.clone());
521 cached.spdx_license_list_version = spdx_version.clone();
522 if let Err(e) = save_cached_index(
523 cache_config,
524 LicenseCacheNamespace::Embedded,
525 &cached,
526 &fingerprint,
527 ) {
528 eprintln!("Warning: failed to save license index cache: {}", e);
529 } else if let Some(size) =
530 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
531 {
532 eprintln!(
533 "License index cache saved ({:.1} MB)",
534 size as f64 / 1_048_576.0
535 );
536 }
537
538 Self::from_index(index, spdx_version)
539 }
540
541 pub fn from_directory(rules_path: &Path) -> Result<Self> {
546 let cache_config =
547 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
548 Self::from_directory_with_cache(rules_path, &cache_config)
549 }
550
551 pub fn from_directory_with_cache(
563 rules_path: &Path,
564 cache_config: &LicenseCacheConfig,
565 ) -> Result<Self> {
566 let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
567 (rules_path.join("rules"), rules_path.join("licenses"))
568 } else if rules_path.ends_with("rules") {
569 let parent = rules_path.parent().ok_or_else(|| {
570 anyhow::anyhow!("Cannot determine parent directory for rules path")
571 })?;
572 (rules_path.to_path_buf(), parent.join("licenses"))
573 } else {
574 (rules_path.to_path_buf(), rules_path.to_path_buf())
575 };
576
577 let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
578 let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
579
580 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses);
581
582 if !cache_config.reindex {
583 if let Some(cached) = load_cached_index(
584 cache_config,
585 LicenseCacheNamespace::CustomRules,
586 &fingerprint,
587 )? {
588 let start = Instant::now();
589 let index = index::LicenseIndex::from(cached);
590 eprintln!(
591 "License index loaded from rkyv cache in {:.2}s",
592 start.elapsed().as_secs_f64()
593 );
594 let spdx_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
595 return Self::from_index(index, spdx_version);
596 }
597 } else {
598 delete_cache(
599 cache_config,
600 LicenseCacheNamespace::CustomRules,
601 &fingerprint,
602 )?;
603 }
604
605 let start = Instant::now();
606 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
607 eprintln!(
608 "License index built from rules directory in {:.2}s",
609 start.elapsed().as_secs_f64()
610 );
611
612 let spdx_license_list_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
613
614 let cached = CachedLicenseIndex::from(index.clone());
615 if let Err(e) = save_cached_index(
616 cache_config,
617 LicenseCacheNamespace::CustomRules,
618 &cached,
619 &fingerprint,
620 ) {
621 eprintln!("Warning: failed to save license index cache: {}", e);
622 } else if let Some(size) = cache_file_size(
623 cache_config,
624 LicenseCacheNamespace::CustomRules,
625 &fingerprint,
626 ) {
627 eprintln!(
628 "License index cache saved ({:.1} MB)",
629 size as f64 / 1_048_576.0
630 );
631 }
632
633 Self::from_index(index, spdx_license_list_version)
634 }
635
636 pub fn embedded_spdx_license_list_version() -> Result<String> {
637 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
638 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
639 .map_err(|e| {
640 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
641 })?
642 .spdx_license_list_version)
643 }
644
645 pub fn detect_with_kind(
646 &self,
647 text: &str,
648 unknown_licenses: bool,
649 binary_derived: bool,
650 ) -> Result<Vec<LicenseDetection>> {
651 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
652 }
653
    /// Runs the full detection pipeline on `text` and returns grouped license
    /// detections.
    ///
    /// * `text` - input text; a UTF-8 BOM is stripped and the text is truncated
    ///   to the detection size limit before matching.
    /// * `unknown_licenses` - when `true`, unknown-license matching is run
    ///   against the matches classified as good.
    /// * `binary_derived` - forwarded to `Query::from_extracted_text`.
    /// * `min_score` - forwarded to `post_process_detections`.
    ///
    /// # Errors
    ///
    /// Fails when the query cannot be built from the text.
    pub fn detect_with_kind_with_score(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
        min_score: f32,
    ) -> Result<Vec<LicenseDetection>> {
        let clean_text = strip_utf8_bom_str(text);

        let content = truncate_detection_text(clean_text);

        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
        let whole_query_run = query.whole_query_run();

        let mut all_matches = Vec::new();
        let mut candidate_contained_matches = Vec::new();
        let mut aho_extra_matchables = PositionSet::new();
        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();

        // Phase 1: hash matching over the whole query run. Any hash match
        // short-circuits the rest of the pipeline.
        {
            let hash_matches = hash_match(&self.index, &whole_query_run);

            if !hash_matches.is_empty() {
                let mut matches = hash_matches;
                sort_matches_by_line(&mut matches);

                let groups = group_matches_by_region(&matches);
                let detections: Vec<LicenseDetection> = groups
                    .iter()
                    .map(|group| {
                        let mut detection = empty_detection();
                        populate_detection_from_group_with_spdx(
                            &mut detection,
                            group,
                            &self.spdx_mapping,
                            Some(content),
                        );
                        detection
                    })
                    .collect();

                return Ok(post_process_detections(detections, min_score));
            }
        }

        // Phase 2: SPDX identifier matching. Matched spans are subtracted from
        // the query and remembered as extra matchables for the Aho phase.
        {
            let spdx_matches = spdx_lid_match(&self.index, &query);
            subtract_spdx_match_qspans(
                &mut query,
                &mut matched_qspans,
                &mut aho_extra_matchables,
                &spdx_matches,
            );
            all_matches.extend(spdx_matches);
        }

        // Phase 3: Aho matching, then sequence matching on what is left.
        {
            let aho_matches = if aho_extra_matchables.is_empty() {
                aho_match(&self.index, &whole_query_run)
            } else {
                aho_match::aho_match_with_extra_matchables(
                    &self.index,
                    &whole_query_run,
                    Some(&aho_extra_matchables),
                )
            };

            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
            // The refined (pre-merge) Aho matches are kept as the reference set
            // for the sequence redundancy filters below.
            candidate_contained_matches.extend(refined_aho.clone());
            let (merged_aho, _) = merge_and_prepare_aho_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &refined_aho,
            );
            all_matches.extend(merged_aho);

            // Near-duplicate sequence matching over the whole query run.
            let whole_query_followup = collect_whole_query_exact_followup_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &whole_query_run,
            );
            all_matches.extend(whole_query_followup);

            // Regular sequence matching over the individual query runs.
            let merged_seq = collect_regular_seq_matches(
                &self.index,
                &query,
                &matched_qspans,
                &candidate_contained_matches,
            );
            all_matches.extend(merged_seq);
        }

        // Phase 4: first refinement pass, without the false-positive filter.
        let merged_matches =
            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);

        // Phase 5 (optional): unknown-license matching. Weak matches are set
        // aside while unknown matches are computed against the good ones, then
        // appended again.
        let refined_matches = if unknown_licenses {
            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);

            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
            let filtered_unknown =
                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);

            let mut all_matches = good_matches;
            all_matches.extend(filtered_unknown);
            all_matches.extend(weak_matches);
            all_matches
        } else {
            merged_matches
        };

        // Phase 6: final refinement, ordering, and grouping into detections.
        let refined = refine_matches(&self.index, refined_matches, &query);

        let mut sorted = refined;
        sort_matches_by_line(&mut sorted);

        let groups = group_matches_by_region(&sorted);

        let detections: Vec<LicenseDetection> = groups
            .iter()
            .map(|group| {
                let mut detection = empty_detection();
                populate_detection_from_group_with_spdx(
                    &mut detection,
                    group,
                    &self.spdx_mapping,
                    Some(content),
                );
                detection
            })
            .collect();

        let detections = post_process_detections(detections, min_score);

        Ok(detections)
    }
806
807 pub fn detect_with_kind_and_source(
808 &self,
809 text: &str,
810 unknown_licenses: bool,
811 binary_derived: bool,
812 source_path: &str,
813 ) -> Result<Vec<LicenseDetection>> {
814 let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
815 attach_source_path_to_detections(&mut detections, source_path);
816 Ok(detections)
817 }
818
819 pub fn detect_with_kind_and_source_with_score(
820 &self,
821 text: &str,
822 unknown_licenses: bool,
823 binary_derived: bool,
824 source_path: &str,
825 min_score: f32,
826 ) -> Result<Vec<LicenseDetection>> {
827 let mut detections =
828 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
829 attach_source_path_to_detections(&mut detections, source_path);
830 Ok(detections)
831 }
832
    /// Runs the matching pipeline on `text` and returns the refined matches,
    /// sorted by line, without grouping them into detections.
    ///
    /// Only compiled for tests or with the `golden-tests` feature. The phases
    /// mirror `detect_with_kind_with_score` up to the final sort.
    ///
    /// # Errors
    ///
    /// Fails when the query cannot be built from the text.
    #[cfg(any(test, feature = "golden-tests"))]
    pub fn detect_matches_with_kind(
        &self,
        text: &str,
        unknown_licenses: bool,
        binary_derived: bool,
    ) -> Result<Vec<LicenseMatch>> {
        let clean_text = strip_utf8_bom_str(text);

        let content = truncate_detection_text(clean_text);

        let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
        let whole_query_run = query.whole_query_run();

        let mut all_matches = Vec::new();
        let mut candidate_contained_matches = Vec::new();
        let mut aho_extra_matchables = PositionSet::new();
        let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();

        // Phase 1: hash matching; any hash match is returned immediately.
        {
            let hash_matches = hash_match(&self.index, &whole_query_run);

            if !hash_matches.is_empty() {
                let mut matches = hash_matches;
                sort_matches_by_line(&mut matches);
                return Ok(matches);
            }
        }

        // Phase 2: SPDX identifier matching; matched spans are subtracted from
        // the query and remembered as extra matchables for the Aho phase.
        {
            let spdx_matches = spdx_lid_match(&self.index, &query);
            subtract_spdx_match_qspans(
                &mut query,
                &mut matched_qspans,
                &mut aho_extra_matchables,
                &spdx_matches,
            );
            all_matches.extend(spdx_matches);
        }

        // Phase 3: Aho matching, then sequence matching on what is left.
        {
            let aho_matches = if aho_extra_matchables.is_empty() {
                aho_match(&self.index, &whole_query_run)
            } else {
                aho_match::aho_match_with_extra_matchables(
                    &self.index,
                    &whole_query_run,
                    Some(&aho_extra_matchables),
                )
            };
            let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
            // The refined (pre-merge) Aho matches are kept as the reference set
            // for the sequence redundancy filters below.
            candidate_contained_matches.extend(refined_aho.clone());
            let (merged_aho, _) = merge_and_prepare_aho_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &refined_aho,
            );
            all_matches.extend(merged_aho);

            // Near-duplicate sequence matching over the whole query run.
            let whole_query_followup = collect_whole_query_exact_followup_matches(
                &self.index,
                &mut query,
                &mut matched_qspans,
                &whole_query_run,
            );
            all_matches.extend(whole_query_followup);

            // Regular sequence matching over the individual query runs.
            let merged_seq = collect_regular_seq_matches(
                &self.index,
                &query,
                &matched_qspans,
                &candidate_contained_matches,
            );
            all_matches.extend(merged_seq);
        }

        // Phase 4: first refinement pass, without the false-positive filter.
        let merged_matches =
            refine_matches_without_false_positive_filter(&self.index, all_matches, &query);

        // Phase 5 (optional): unknown-license matching against the good
        // matches; weak matches are appended again afterwards.
        let refined_matches = if unknown_licenses {
            let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
            let unknown_matches = unknown_match(&self.index, &query, &good_matches);
            let filtered_unknown =
                filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);

            let mut all_matches = good_matches;
            all_matches.extend(filtered_unknown);
            all_matches.extend(weak_matches);
            all_matches
        } else {
            merged_matches
        };

        // Phase 6: final refinement and ordering.
        let refined = refine_matches(&self.index, refined_matches, &query);

        let mut sorted = refined;
        sort_matches_by_line(&mut sorted);

        Ok(sorted)
    }
945
    /// Returns a reference to the license index held by this engine.
    pub fn index(&self) -> &index::LicenseIndex {
        &self.index
    }
950
    /// Returns the SPDX license list version stored on this engine, or `None`
    /// when no version was recorded.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }
954
    /// Returns the SPDX mapping held by this engine (test builds only).
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
960}
961
962pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
963 for ancestor in search_path.ancestors() {
964 let candidate = ancestor.join("scancode_config.py");
965 if candidate.is_file() {
966 let config = fs::read_to_string(&candidate)?;
967 return Ok(parse_scancode_spdx_license_list_version(&config));
968 }
969 }
970
971 Ok(None)
972}
973
/// Extracts the value assigned to `spdx_license_list_version` from the text of
/// a ScanCode `scancode_config.py` file.
///
/// The first line whose left-hand side starts with the exact identifier
/// `spdx_license_list_version` wins. The identifier must end at an identifier
/// boundary, so longer names that merely share the prefix (for example
/// `spdx_license_list_version_url`) are skipped, while forms such as
/// `spdx_license_list_version: str = "3.27"` are still accepted. Surrounding
/// whitespace and single or double quotes are stripped from the value.
///
/// Returns `None` when no such assignment is present.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        // Lines without `=` cannot be assignments; `?` skips to the next line.
        let (lhs, value) = line.trim().split_once('=')?;
        let after_key = lhs.strip_prefix(KEY)?;

        // Reject identifiers that only share the prefix with the wanted key.
        let continues_identifier =
            after_key.starts_with(|c: char| c.is_alphanumeric() || c == '_');

        (!continues_identifier).then(|| {
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string()
        })
    })
}
987
988#[cfg(test)]
989mod tests;