1pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7pub mod license_cache;
8mod position_set;
9mod token_multiset;
10mod token_set;
11
12#[cfg(test)]
13mod embedded_test;
14pub mod expression;
15#[cfg(all(test, feature = "golden-tests"))]
16mod golden_test;
17#[cfg(feature = "golden-tests")]
18pub mod golden_utils;
19pub mod hash_match;
20pub mod index;
21mod match_refine;
22pub mod models;
23pub mod query;
24pub mod rules;
25pub mod seq_match;
26pub mod spdx_lid;
27pub mod spdx_mapping;
28#[cfg(test)]
29mod test_utils;
30pub mod tokenize;
31pub mod unknown_match;
32
33use bit_set::BitSet;
34use std::collections::HashSet;
35use std::fs;
36use std::path::Path;
37use std::sync::Arc;
38use std::time::Instant;
39
40use anyhow::Result;
41
42use crate::license_detection::embedded::index::{
43 load_embedded_artifact_metadata_from_bytes, load_loader_snapshot_from_bytes,
44};
45use crate::license_detection::index::build_index_from_loaded;
46use crate::license_detection::license_cache::{
47 LicenseCacheConfig, LicenseCacheNamespace, cache_file_size, compute_artifact_fingerprint,
48 compute_rules_fingerprint, delete_cache, load_cached_index, save_cached_index,
49};
50use crate::license_detection::query::Query;
51use crate::license_detection::rules::{
52 load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
53};
54use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
55use crate::utils::text::strip_utf8_bom_str;
56
57use crate::license_detection::detection::{
58 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
59};
60use crate::license_detection::models::MatcherKind;
61
/// Relative path of the ScanCode rules directory under the
/// `reference/scancode-toolkit` tree.
// Not referenced by every build configuration, hence the lint allowance.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Relative path of the ScanCode licenses directory under the
/// `reference/scancode-toolkit` tree.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Relative path of the ScanCode `data` directory, the common parent of the
/// `rules` and `licenses` directories named by the two constants above.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for the public ScanCode LicenseDB site.
///
/// The `{}` placeholder is presumably replaced with a license-specific path
/// segment by callers (NOTE(review): confirm what is substituted).
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
80
81pub(crate) use detection::{
82 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
83};
84pub use models::LicenseMatch;
85
86pub use aho_match::aho_match;
87pub use hash_match::hash_match;
88pub use match_refine::{
89 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
90 refine_matches_without_false_positive_filter, split_weak_matches,
91};
92pub use position_set::PositionSet;
93pub use spdx_lid::spdx_lid_match;
94pub use token_multiset::TokenMultiset;
95pub use token_set::TokenSet;
96pub use unknown_match::unknown_match;
97
98use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
99
/// Entry point for license detection.
///
/// Bundles a license index with the SPDX mapping derived from it. Cloning is
/// cheap with respect to the index because it is held behind an [`Arc`].
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Shared, reference-counted license index used by every matcher.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses at construction time.
    spdx_mapping: SpdxMapping,
    /// SPDX license list version associated with the loaded data, if known.
    spdx_license_list_version: Option<String>,
}
111
/// Maximum number of bytes of input text handed to the matchers; longer input
/// is truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;

/// Candidate limit passed to `select_seq_candidates` for regular (per query
/// run) sequence matching.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;

/// Per-child allowance of query positions that a sequence container may cover
/// beyond its exact children while still being considered redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;

/// Allowance of query positions covered by exact children but not by the
/// sequence container while the container is still considered redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
116
117fn truncate_detection_text(clean_text: &str) -> &str {
118 if clean_text.len() <= MAX_DETECTION_SIZE {
119 return clean_text;
120 }
121
122 log::debug!(
123 "Content size {} exceeds limit {}, truncating for detection",
124 clean_text.len(),
125 MAX_DETECTION_SIZE
126 );
127
128 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
129 &clean_text[..boundary]
130}
131
132fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
133 (!m.query_span().is_empty()).then(|| m.query_span().clone())
134}
135
136fn has_full_match_coverage(m: &LicenseMatch) -> bool {
137 m.coverage() == 100.0
138}
139
/// Decides whether a sequence match is a redundant "container" around exact
/// (Aho) matches that carry the same license expression.
///
/// * `container` - the sequence match under consideration.
/// * `candidate_contained_matches` - matches that may act as the container's
///   children; only fully covered Aho matches with the same license expression
///   that overlap the container's query positions are considered.
///
/// Returns `true` when the container should be dropped in favour of its
/// children, `false` when it must be kept.
fn is_redundant_same_expression_seq_container(
    container: &LicenseMatch,
    candidate_contained_matches: &[LicenseMatch],
) -> bool {
    // Only sequence matches with coverage of at least 99 are candidates.
    // (The full-coverage test is subsumed by the `>= 99.0` comparison.)
    let container_is_redundant_coverage =
        has_full_match_coverage(container) || container.coverage() >= 99.0;
    if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
        return false;
    }

    let container_qspan_set = container.qspan_set();

    // Children: fully covered Aho matches for the same expression that overlap
    // the container. Note the test is overlap, not strict containment.
    let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
        .iter()
        .filter(|m| {
            m.matcher == MatcherKind::Aho
                && has_full_match_coverage(m)
                && m.license_expression == container.license_expression
                && m.overlaps_with(&container_qspan_set)
        })
        .collect();

    // A single child cannot make the container redundant.
    if contained.len() < 2 {
        return false;
    }

    // Require at least two children whose matched length exceeds one.
    let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
    if material_children < 2 {
        return false;
    }

    // Order children by their query-span bounds so adjacent pairs can be compared.
    contained.sort_by_key(|m| m.qspan_bounds());

    // Union of all query positions covered by the children.
    let mut child_union = PositionSet::new();
    for m in &contained {
        child_union.extend_from_span(m.query_span());
    }

    // Positions covered by exactly one side.
    let container_only_positions = container_qspan_set.difference(&child_union);
    let child_only_positions = child_union.difference(&container_qspan_set);

    // "Bridge" positions are the gaps between consecutive children.
    // NOTE(review): the range below treats the second bound as exclusive — confirm
    // against `qspan_bounds`.
    let mut bridge_positions = BitSet::new();
    for pair in contained.windows(2) {
        let (_, previous_end) = pair[0].qspan_bounds();
        let (next_start, _) = pair[1].qspan_bounds();

        // Children that overlap one another disqualify the container.
        if next_start < previous_end {
            return false;
        }

        for pos in previous_end..next_start {
            bridge_positions.insert(pos);
        }
    }

    // Container-only positions that do not fall inside any bridge.
    let container_only_boundary_positions = container_only_positions
        .iter()
        .filter(|&pos| !bridge_positions.contains(pos))
        .count();

    // Keep the container when its sole extra position sits inside a bridge and
    // the children add nothing beyond the container.
    if container_only_positions.len() == 1
        && container_only_boundary_positions == 0
        && child_only_positions.is_empty()
    {
        return false;
    }

    // Keep the container when its (at most three) extra positions all lie outside
    // the bridges and on a single side of the children.
    if child_only_positions.is_empty()
        && container_only_positions.len() == container_only_boundary_positions
        && container_only_boundary_positions <= 3
    {
        // `contained` holds at least two entries here, so the fallbacks are unused.
        let earliest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().0)
            .min()
            .unwrap_or(usize::MAX);
        let latest_child = contained
            .iter()
            .map(|m| m.qspan_bounds().1.saturating_sub(1))
            .max()
            .unwrap_or(0);

        let is_one_sided_boundary = container_only_positions
            .iter()
            .all(|pos| pos < earliest_child)
            || container_only_positions
                .iter()
                .all(|pos| pos > latest_child);

        if is_one_sided_boundary {
            return false;
        }
    }

    // Size limits scale with the number of children; `contained.len() >= 2`
    // guarantees the subtraction cannot underflow.
    let max_container_only_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
    let max_container_boundary_positions =
        MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
    let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;

    // Redundant only when every difference stays within its limit.
    container_only_positions.len() <= max_container_only_positions
        && container_only_boundary_positions <= max_container_boundary_positions
        && child_only_positions.len() <= max_child_only_positions
}
244
245fn filter_redundant_same_expression_seq_containers(
246 seq_matches: Vec<LicenseMatch>,
247 candidate_contained_matches: &[LicenseMatch],
248) -> Vec<LicenseMatch> {
249 seq_matches
250 .into_iter()
251 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
252 .collect()
253}
254
255fn is_redundant_low_coverage_composite_seq_wrapper(
256 container: &LicenseMatch,
257 candidate_contained_matches: &[LicenseMatch],
258) -> bool {
259 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
260 return false;
261 }
262
263 let container_qspan_set = container.qspan_set();
264
265 let children: Vec<&LicenseMatch> = candidate_contained_matches
266 .iter()
267 .filter(|m| {
268 m.matcher == aho_match::MATCH_AHO
269 && has_full_match_coverage(m)
270 && m.license_expression != container.license_expression
271 && m.overlaps_with(&container_qspan_set)
272 })
273 .collect();
274
275 if children.len() < 2 {
276 return false;
277 }
278
279 let unique_expressions: HashSet<&str> = children
280 .iter()
281 .map(|m| m.license_expression.as_str())
282 .collect();
283 if unique_expressions.len() < 2 {
284 return false;
285 }
286
287 let mut child_union = PositionSet::new();
288 for m in &children {
289 child_union.extend_from_span(m.query_span());
290 }
291
292 let container_only_positions = container_qspan_set.difference(&child_union);
293 let child_only_positions = child_union.difference(&container_qspan_set);
294
295 let mut sorted_children = children;
296 sorted_children.sort_by_key(|m| m.qspan_bounds());
297
298 let mut bridge_positions = BitSet::new();
299 for pair in sorted_children.windows(2) {
300 let (_, previous_end) = pair[0].qspan_bounds();
301 let (next_start, _) = pair[1].qspan_bounds();
302 for pos in previous_end..next_start {
303 bridge_positions.insert(pos);
304 }
305 }
306
307 let container_only_boundary_positions = container_only_positions
308 .iter()
309 .filter(|&pos| !bridge_positions.contains(pos))
310 .count();
311
312 child_only_positions.is_empty()
313 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
314 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
315}
316
317fn filter_redundant_low_coverage_composite_seq_wrappers(
318 seq_matches: Vec<LicenseMatch>,
319 candidate_contained_matches: &[LicenseMatch],
320) -> Vec<LicenseMatch> {
321 seq_matches
322 .into_iter()
323 .filter(|m| {
324 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
325 })
326 .collect()
327}
328
329fn subtract_spdx_match_qspans(
330 query: &mut Query<'_>,
331 matched_qspans: &mut Vec<models::PositionSpan>,
332 aho_extra_matchables: &mut PositionSet,
333 spdx_matches: &[LicenseMatch],
334) {
335 for m in spdx_matches {
336 let Some(span) = query_span_for_match(m) else {
337 continue;
338 };
339
340 aho_extra_matchables.extend_from_span(&span);
341 query.subtract(&span);
342
343 if has_full_match_coverage(m) {
344 matched_qspans.push(span);
345 }
346 }
347}
348
349fn merge_and_prepare_aho_matches(
350 index: &index::LicenseIndex,
351 query: &mut Query<'_>,
352 matched_qspans: &mut Vec<models::PositionSpan>,
353 refined_aho: &[LicenseMatch],
354) -> (Vec<LicenseMatch>, bool) {
355 let merged_aho = merge_overlapping_matches(refined_aho);
356 let mut saw_long_exact_license_text_match = false;
357
358 for m in &merged_aho {
359 let Some(span) = query_span_for_match(m) else {
360 continue;
361 };
362
363 if has_full_match_coverage(m) {
364 matched_qspans.push(span.clone());
365 }
366
367 if index
368 .rules_by_rid
369 .get(m.rid)
370 .is_some_and(|rule| rule.is_license_text())
371 && m.rule_length > 120
372 && m.coverage() > 98.0
373 {
374 query.subtract(&span);
375 saw_long_exact_license_text_match = true;
376 }
377 }
378
379 (merged_aho, saw_long_exact_license_text_match)
380}
381
382fn collect_whole_query_exact_followup_matches(
383 index: &index::LicenseIndex,
384 query: &mut Query<'_>,
385 matched_qspans: &mut Vec<models::PositionSpan>,
386 whole_run: &query::QueryRun<'_>,
387) -> Vec<LicenseMatch> {
388 let mut seq_all_matches = Vec::new();
389
390 if whole_run.is_matchable(false, matched_qspans) {
391 let near_dupe_candidates =
392 select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
393
394 if !near_dupe_candidates.is_empty() {
395 let near_dupe_matches =
396 seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
397
398 for m in &near_dupe_matches {
399 if !m.query_span().is_empty() {
400 let span = m.query_span().clone();
401 query.subtract(&span);
402 matched_qspans.push(span);
403 }
404 }
405
406 seq_all_matches.extend(near_dupe_matches);
407 }
408 }
409
410 seq_all_matches
411}
412
413fn collect_regular_seq_matches(
414 index: &index::LicenseIndex,
415 query: &Query<'_>,
416 matched_qspans: &[models::PositionSpan],
417 candidate_contained_matches: &[LicenseMatch],
418) -> Vec<LicenseMatch> {
419 let mut seq_all_matches = Vec::new();
420
421 for query_run in query.query_runs() {
422 if !query_run.is_matchable(false, matched_qspans) {
423 continue;
424 }
425
426 let candidates =
427 select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
428 if !candidates.is_empty() {
429 let matches = seq_match_with_candidates(index, &query_run, &candidates);
430 seq_all_matches.extend(matches);
431 }
432 }
433
434 let merged_seq = merge_overlapping_matches(&seq_all_matches);
435 let filtered_same_expression =
436 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
437 filter_redundant_low_coverage_composite_seq_wrappers(
438 filtered_same_expression,
439 candidate_contained_matches,
440 )
441}
442
443impl LicenseDetectionEngine {
444 fn from_index(
449 index: index::LicenseIndex,
450 spdx_license_list_version: Option<String>,
451 ) -> Result<Self> {
452 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
453 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
454 let spdx_mapping = build_spdx_mapping(&license_vec);
455
456 Ok(Self {
457 index: Arc::new(index),
458 spdx_mapping,
459 spdx_license_list_version,
460 })
461 }
462
463 pub fn from_embedded() -> Result<Self> {
468 let cache_config =
469 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
470 Self::from_embedded_with_cache(&cache_config)
471 }
472
473 pub fn from_embedded_with_cache(cache_config: &LicenseCacheConfig) -> Result<Self> {
488 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
489 let fingerprint = compute_artifact_fingerprint(artifact_bytes);
490
491 if !cache_config.reindex {
492 if let Some(cached) =
493 load_cached_index(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?
494 {
495 let start = Instant::now();
496 let spdx_version = cached.spdx_license_list_version.clone();
497 eprintln!(
498 "License index loaded from rkyv cache in {:.2}s",
499 start.elapsed().as_secs_f64()
500 );
501 return Self::from_index(cached, spdx_version);
502 }
503 } else {
504 delete_cache(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)?;
505 }
506
507 let snapshot = load_loader_snapshot_from_bytes(artifact_bytes)
508 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
509 let spdx_version = Some(snapshot.metadata.spdx_license_list_version.clone());
510
511 let start = Instant::now();
512 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
513 eprintln!(
514 "License index built from embedded artifact in {:.2}s",
515 start.elapsed().as_secs_f64()
516 );
517
518 let mut index = index;
519 index.spdx_license_list_version = spdx_version.clone();
520 if let Err(e) = save_cached_index(
521 cache_config,
522 LicenseCacheNamespace::Embedded,
523 &index,
524 &fingerprint,
525 ) {
526 eprintln!("Warning: failed to save license index cache: {}", e);
527 } else if let Some(size) =
528 cache_file_size(cache_config, LicenseCacheNamespace::Embedded, &fingerprint)
529 {
530 eprintln!(
531 "License index cache saved ({:.1} MB)",
532 size as f64 / 1_048_576.0
533 );
534 }
535
536 Self::from_index(index, spdx_version)
537 }
538
539 pub fn from_directory(rules_path: &Path) -> Result<Self> {
544 let cache_config =
545 LicenseCacheConfig::new(LicenseCacheConfig::default_root_dir(), false, true);
546 Self::from_directory_with_cache(rules_path, &cache_config)
547 }
548
549 pub fn from_directory_with_cache(
561 rules_path: &Path,
562 cache_config: &LicenseCacheConfig,
563 ) -> Result<Self> {
564 let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
565 (rules_path.join("rules"), rules_path.join("licenses"))
566 } else if rules_path.ends_with("rules") {
567 let parent = rules_path.parent().ok_or_else(|| {
568 anyhow::anyhow!("Cannot determine parent directory for rules path")
569 })?;
570 (rules_path.to_path_buf(), parent.join("licenses"))
571 } else {
572 (rules_path.to_path_buf(), rules_path.to_path_buf())
573 };
574
575 let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
576 let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
577
578 let fingerprint = compute_rules_fingerprint(&loaded_rules, &loaded_licenses);
579
580 if !cache_config.reindex {
581 if let Some(cached) = load_cached_index(
582 cache_config,
583 LicenseCacheNamespace::CustomRules,
584 &fingerprint,
585 )? {
586 let start = Instant::now();
587 eprintln!(
588 "License index loaded from rkyv cache in {:.2}s",
589 start.elapsed().as_secs_f64()
590 );
591 let spdx_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
592 return Self::from_index(cached, spdx_version);
593 }
594 } else {
595 delete_cache(
596 cache_config,
597 LicenseCacheNamespace::CustomRules,
598 &fingerprint,
599 )?;
600 }
601
602 let start = Instant::now();
603 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
604 eprintln!(
605 "License index built from rules directory in {:.2}s",
606 start.elapsed().as_secs_f64()
607 );
608
609 let spdx_license_list_version = detect_scancode_spdx_license_list_version(&rules_dir)?;
610
611 if let Err(e) = save_cached_index(
612 cache_config,
613 LicenseCacheNamespace::CustomRules,
614 &index,
615 &fingerprint,
616 ) {
617 eprintln!("Warning: failed to save license index cache: {}", e);
618 } else if let Some(size) = cache_file_size(
619 cache_config,
620 LicenseCacheNamespace::CustomRules,
621 &fingerprint,
622 ) {
623 eprintln!(
624 "License index cache saved ({:.1} MB)",
625 size as f64 / 1_048_576.0
626 );
627 }
628
629 Self::from_index(index, spdx_license_list_version)
630 }
631
632 pub fn embedded_spdx_license_list_version() -> Result<String> {
633 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
634 Ok(load_embedded_artifact_metadata_from_bytes(artifact_bytes)
635 .map_err(|e| {
636 anyhow::anyhow!("Failed to load embedded license artifact metadata: {}", e)
637 })?
638 .spdx_license_list_version)
639 }
640
641 pub fn detect_with_kind(
642 &self,
643 text: &str,
644 unknown_licenses: bool,
645 binary_derived: bool,
646 ) -> Result<Vec<LicenseDetection>> {
647 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
648 }
649
650 pub fn detect_with_kind_with_score(
651 &self,
652 text: &str,
653 unknown_licenses: bool,
654 binary_derived: bool,
655 min_score: f32,
656 ) -> Result<Vec<LicenseDetection>> {
657 let clean_text = strip_utf8_bom_str(text);
658
659 let content = truncate_detection_text(clean_text);
660
661 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
662 let whole_query_run = query.whole_query_run();
663
664 let mut all_matches = Vec::new();
665 let mut candidate_contained_matches = Vec::new();
666 let mut aho_extra_matchables = PositionSet::new();
667 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
668
669 {
672 let hash_matches = hash_match(&self.index, &whole_query_run);
673
674 if !hash_matches.is_empty() {
675 let mut matches = hash_matches;
676 sort_matches_by_line(&mut matches);
677
678 let groups = group_matches_by_region(&matches);
679 let detections: Vec<LicenseDetection> = groups
680 .iter()
681 .map(|group| {
682 let mut detection = empty_detection();
683 populate_detection_from_group_with_spdx(
684 &mut detection,
685 group,
686 &self.spdx_mapping,
687 Some(content),
688 );
689 detection
690 })
691 .collect();
692
693 return Ok(post_process_detections(detections, min_score));
694 }
695 }
696
697 {
699 let spdx_matches = spdx_lid_match(&self.index, &query);
700 subtract_spdx_match_qspans(
701 &mut query,
702 &mut matched_qspans,
703 &mut aho_extra_matchables,
704 &spdx_matches,
705 );
706 all_matches.extend(spdx_matches);
707 }
708
709 {
711 let aho_matches = if aho_extra_matchables.is_empty() {
712 aho_match(&self.index, &whole_query_run)
713 } else {
714 aho_match::aho_match_with_extra_matchables(
715 &self.index,
716 &whole_query_run,
717 Some(&aho_extra_matchables),
718 )
719 };
720
721 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
724 candidate_contained_matches.extend(refined_aho.clone());
725 let (merged_aho, _) = merge_and_prepare_aho_matches(
726 &self.index,
727 &mut query,
728 &mut matched_qspans,
729 &refined_aho,
730 );
731 all_matches.extend(merged_aho);
732
733 let whole_query_followup = collect_whole_query_exact_followup_matches(
734 &self.index,
735 &mut query,
736 &mut matched_qspans,
737 &whole_query_run,
738 );
739 all_matches.extend(whole_query_followup);
740
741 let merged_seq = collect_regular_seq_matches(
742 &self.index,
743 &query,
744 &matched_qspans,
745 &candidate_contained_matches,
746 );
747 all_matches.extend(merged_seq);
748 }
749
750 let merged_matches =
753 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
754
755 let refined_matches = if unknown_licenses {
758 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
760
761 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
763 let filtered_unknown =
764 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
765
766 let mut all_matches = good_matches;
767 all_matches.extend(filtered_unknown);
768 all_matches.extend(weak_matches);
771 all_matches
772 } else {
773 merged_matches
774 };
775
776 let refined = refine_matches(&self.index, refined_matches, &query);
778
779 let mut sorted = refined;
780 sort_matches_by_line(&mut sorted);
781
782 let groups = group_matches_by_region(&sorted);
783
784 let detections: Vec<LicenseDetection> = groups
785 .iter()
786 .map(|group| {
787 let mut detection = empty_detection();
788 populate_detection_from_group_with_spdx(
789 &mut detection,
790 group,
791 &self.spdx_mapping,
792 Some(content),
793 );
794 detection
795 })
796 .collect();
797
798 let detections = post_process_detections(detections, min_score);
799
800 Ok(detections)
801 }
802
803 pub fn detect_with_kind_and_source(
804 &self,
805 text: &str,
806 unknown_licenses: bool,
807 binary_derived: bool,
808 source_path: &str,
809 ) -> Result<Vec<LicenseDetection>> {
810 let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
811 attach_source_path_to_detections(&mut detections, source_path);
812 Ok(detections)
813 }
814
815 pub fn detect_with_kind_and_source_with_score(
816 &self,
817 text: &str,
818 unknown_licenses: bool,
819 binary_derived: bool,
820 source_path: &str,
821 min_score: f32,
822 ) -> Result<Vec<LicenseDetection>> {
823 let mut detections =
824 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
825 attach_source_path_to_detections(&mut detections, source_path);
826 Ok(detections)
827 }
828
829 #[cfg(any(test, feature = "golden-tests"))]
834 pub fn detect_matches_with_kind(
835 &self,
836 text: &str,
837 unknown_licenses: bool,
838 binary_derived: bool,
839 ) -> Result<Vec<LicenseMatch>> {
840 let clean_text = strip_utf8_bom_str(text);
841
842 let content = truncate_detection_text(clean_text);
843
844 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
845 let whole_query_run = query.whole_query_run();
846
847 let mut all_matches = Vec::new();
848 let mut candidate_contained_matches = Vec::new();
849 let mut aho_extra_matchables = PositionSet::new();
850 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
851
852 {
854 let hash_matches = hash_match(&self.index, &whole_query_run);
855
856 if !hash_matches.is_empty() {
857 let mut matches = hash_matches;
858 sort_matches_by_line(&mut matches);
859 return Ok(matches);
860 }
861 }
862
863 {
865 let spdx_matches = spdx_lid_match(&self.index, &query);
866 subtract_spdx_match_qspans(
867 &mut query,
868 &mut matched_qspans,
869 &mut aho_extra_matchables,
870 &spdx_matches,
871 );
872 all_matches.extend(spdx_matches);
873 }
874
875 {
877 let aho_matches = if aho_extra_matchables.is_empty() {
878 aho_match(&self.index, &whole_query_run)
879 } else {
880 aho_match::aho_match_with_extra_matchables(
881 &self.index,
882 &whole_query_run,
883 Some(&aho_extra_matchables),
884 )
885 };
886 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
887 candidate_contained_matches.extend(refined_aho.clone());
888 let (merged_aho, _) = merge_and_prepare_aho_matches(
889 &self.index,
890 &mut query,
891 &mut matched_qspans,
892 &refined_aho,
893 );
894 all_matches.extend(merged_aho);
895
896 let whole_query_followup = collect_whole_query_exact_followup_matches(
897 &self.index,
898 &mut query,
899 &mut matched_qspans,
900 &whole_query_run,
901 );
902 all_matches.extend(whole_query_followup);
903
904 let merged_seq = collect_regular_seq_matches(
905 &self.index,
906 &query,
907 &matched_qspans,
908 &candidate_contained_matches,
909 );
910 all_matches.extend(merged_seq);
911 }
912
913 let merged_matches =
915 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
916
917 let refined_matches = if unknown_licenses {
919 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
920 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
921 let filtered_unknown =
922 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
923
924 let mut all_matches = good_matches;
925 all_matches.extend(filtered_unknown);
926 all_matches.extend(weak_matches);
927 all_matches
928 } else {
929 merged_matches
930 };
931
932 let refined = refine_matches(&self.index, refined_matches, &query);
934
935 let mut sorted = refined;
936 sort_matches_by_line(&mut sorted);
937
938 Ok(sorted)
940 }
941
942 pub fn index(&self) -> &index::LicenseIndex {
944 &self.index
945 }
946
    /// Returns the SPDX license list version associated with this engine, or
    /// `None` when no version was recorded at construction time.
    pub fn spdx_license_list_version(&self) -> Option<&str> {
        self.spdx_license_list_version.as_deref()
    }
950
    /// Returns the SPDX mapping built for this engine (test builds only).
    #[cfg(test)]
    pub fn spdx_mapping(&self) -> &SpdxMapping {
        &self.spdx_mapping
    }
956}
957
958pub fn detect_scancode_spdx_license_list_version(search_path: &Path) -> Result<Option<String>> {
959 for ancestor in search_path.ancestors() {
960 let candidate = ancestor.join("scancode_config.py");
961 if candidate.is_file() {
962 let config = fs::read_to_string(&candidate)?;
963 return Ok(parse_scancode_spdx_license_list_version(&config));
964 }
965 }
966
967 Ok(None)
968}
969
/// Extracts the value assigned to `spdx_license_list_version` from the text of
/// a ScanCode configuration file.
///
/// The first line of the form `spdx_license_list_version = <value>` wins. The
/// value is trimmed of surrounding whitespace and of surrounding double and
/// single quotes. Lines whose key merely *starts with* that name (for example
/// `spdx_license_list_version_url = ...`) are ignored; a key followed by a
/// non-identifier character such as a `:` type annotation is still accepted.
///
/// Returns `None` when no matching assignment is present.
fn parse_scancode_spdx_license_list_version(config: &str) -> Option<String> {
    const KEY: &str = "spdx_license_list_version";

    config.lines().find_map(|line| {
        let (name, value) = line.trim().split_once('=')?;

        // The key must end at an identifier boundary, otherwise this is a
        // different, longer variable name.
        let after_key = name.strip_prefix(KEY)?;
        if after_key.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
            return None;
        }

        Some(
            value
                .trim()
                .trim_matches('"')
                .trim_matches('\'')
                .to_string(),
        )
    })
}
983
984#[cfg(test)]
985mod tests;