1pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7mod position_set;
8mod token_multiset;
9mod token_set;
10
11#[cfg(test)]
12mod embedded_test;
13pub mod expression;
14#[cfg(test)]
15mod golden_test;
16#[cfg(feature = "golden-tests")]
17pub mod golden_utils;
18pub mod hash_match;
19pub mod index;
20mod match_refine;
21pub mod models;
22pub mod query;
23pub mod rules;
24pub mod seq_match;
25pub mod spdx_lid;
26pub mod spdx_mapping;
27#[cfg(test)]
28mod test_utils;
29pub mod tokenize;
30pub mod unknown_match;
31
32use bit_set::BitSet;
33use std::collections::HashSet;
34use std::path::Path;
35use std::sync::Arc;
36
37use anyhow::Result;
38
39use crate::license_detection::embedded::index::load_license_index_from_bytes;
40use crate::license_detection::index::build_index_from_loaded;
41use crate::license_detection::query::Query;
42use crate::license_detection::rules::{
43 load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
44};
45use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
46use crate::utils::text::strip_utf8_bom_str;
47
48use crate::license_detection::detection::{
49 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
50};
51use crate::license_detection::models::MatcherKind;
52
/// Location of the ScanCode rule files inside the `reference/scancode-toolkit` tree.
// NOTE(review): presumably resolved relative to the repository root — confirm against callers.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Location of the ScanCode license files inside the `reference/scancode-toolkit` tree.
// NOTE(review): presumably resolved relative to the repository root — confirm against callers.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Parent data directory that holds both the `rules` and `licenses` directories above.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";

/// URL template for the public LicenseDB site.
// NOTE(review): the `{}` placeholder is presumably filled with a license key or file name — confirm.
pub const DEFAULT_LICENSEDB_URL_TEMPLATE: &str = "https://scancode-licensedb.aboutcode.org/{}";
71
72pub(crate) use detection::{
73 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
74};
75pub use models::LicenseMatch;
76
77pub use aho_match::aho_match;
78pub use hash_match::hash_match;
79pub use match_refine::{
80 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
81 refine_matches_without_false_positive_filter, split_weak_matches,
82};
83pub use position_set::PositionSet;
84pub use spdx_lid::spdx_lid_match;
85pub use token_multiset::TokenMultiset;
86pub use token_set::TokenSet;
87pub use unknown_match::unknown_match;
88
89use self::seq_match::{MAX_NEAR_DUPE_CANDIDATES, select_seq_candidates, seq_match_with_candidates};
90
/// License detection engine: a license index together with the SPDX mapping
/// built from that index's licenses.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Reference-counted license index; cloning the engine shares it rather than copying it.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping built from the index's licenses when the engine is constructed.
    spdx_mapping: SpdxMapping,
}
101
/// Maximum text size in bytes (10 MiB) that is fed to detection; longer text is
/// truncated by `truncate_detection_text`.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to `select_seq_candidates` for the per-query-run
/// sequence matching done in `collect_regular_seq_matches`.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Per-child tolerance, in query positions, for positions covered only by a
/// sequence container when deciding whether that container is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Tolerance, in query positions, for positions covered only by the contained
/// matches when deciding whether a sequence container is redundant.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
106
107fn truncate_detection_text(clean_text: &str) -> &str {
108 if clean_text.len() <= MAX_DETECTION_SIZE {
109 return clean_text;
110 }
111
112 log::warn!(
113 "Content size {} exceeds limit {}, truncating for detection",
114 clean_text.len(),
115 MAX_DETECTION_SIZE
116 );
117
118 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
119 &clean_text[..boundary]
120}
121
122fn query_span_for_match(m: &LicenseMatch) -> Option<models::PositionSpan> {
123 (!m.query_span().is_empty()).then(|| m.query_span().clone())
124}
125
126fn has_full_match_coverage(m: &LicenseMatch) -> bool {
127 m.coverage() == 100.0
128}
129
130fn is_redundant_same_expression_seq_container(
131 container: &LicenseMatch,
132 candidate_contained_matches: &[LicenseMatch],
133) -> bool {
134 let container_is_redundant_coverage =
135 has_full_match_coverage(container) || container.coverage() >= 99.0;
136 if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
137 return false;
138 }
139
140 let container_qspan_set = container.qspan_set();
141
142 let mut contained: Vec<&LicenseMatch> = candidate_contained_matches
143 .iter()
144 .filter(|m| {
145 m.matcher == MatcherKind::Aho
146 && has_full_match_coverage(m)
147 && m.license_expression == container.license_expression
148 && m.overlaps_with(&container_qspan_set)
149 })
150 .collect();
151
152 if contained.len() < 2 {
153 return false;
154 }
155
156 let material_children = contained.iter().filter(|m| m.matched_length > 1).count();
157 if material_children < 2 {
158 return false;
159 }
160
161 contained.sort_by_key(|m| m.qspan_bounds());
162
163 let mut child_union = PositionSet::new();
164 for m in &contained {
165 child_union.extend_from_span(m.query_span());
166 }
167
168 let container_only_positions = container_qspan_set.difference(&child_union);
169 let child_only_positions = child_union.difference(&container_qspan_set);
170
171 let mut bridge_positions = BitSet::new();
172 for pair in contained.windows(2) {
173 let (_, previous_end) = pair[0].qspan_bounds();
174 let (next_start, _) = pair[1].qspan_bounds();
175
176 if next_start < previous_end {
177 return false;
178 }
179
180 for pos in previous_end..next_start {
181 bridge_positions.insert(pos);
182 }
183 }
184
185 let container_only_boundary_positions = container_only_positions
186 .iter()
187 .filter(|&pos| !bridge_positions.contains(pos))
188 .count();
189
190 if container_only_positions.len() == 1
191 && container_only_boundary_positions == 0
192 && child_only_positions.is_empty()
193 {
194 return false;
195 }
196
197 if child_only_positions.is_empty()
198 && container_only_positions.len() == container_only_boundary_positions
199 && container_only_boundary_positions <= 3
200 {
201 let earliest_child = contained
202 .iter()
203 .map(|m| m.qspan_bounds().0)
204 .min()
205 .unwrap_or(usize::MAX);
206 let latest_child = contained
207 .iter()
208 .map(|m| m.qspan_bounds().1.saturating_sub(1))
209 .max()
210 .unwrap_or(0);
211
212 let is_one_sided_boundary = container_only_positions
213 .iter()
214 .all(|pos| pos < earliest_child)
215 || container_only_positions
216 .iter()
217 .all(|pos| pos > latest_child);
218
219 if is_one_sided_boundary {
220 return false;
221 }
222 }
223
224 let max_container_only_positions =
225 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
226 let max_container_boundary_positions =
227 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
228 let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
229
230 container_only_positions.len() <= max_container_only_positions
231 && container_only_boundary_positions <= max_container_boundary_positions
232 && child_only_positions.len() <= max_child_only_positions
233}
234
235fn filter_redundant_same_expression_seq_containers(
236 seq_matches: Vec<LicenseMatch>,
237 candidate_contained_matches: &[LicenseMatch],
238) -> Vec<LicenseMatch> {
239 seq_matches
240 .into_iter()
241 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
242 .collect()
243}
244
245fn is_redundant_low_coverage_composite_seq_wrapper(
246 container: &LicenseMatch,
247 candidate_contained_matches: &[LicenseMatch],
248) -> bool {
249 if container.matcher != seq_match::MATCH_SEQ || container.coverage() >= 30.0 {
250 return false;
251 }
252
253 let container_qspan_set = container.qspan_set();
254
255 let children: Vec<&LicenseMatch> = candidate_contained_matches
256 .iter()
257 .filter(|m| {
258 m.matcher == aho_match::MATCH_AHO
259 && has_full_match_coverage(m)
260 && m.license_expression != container.license_expression
261 && m.overlaps_with(&container_qspan_set)
262 })
263 .collect();
264
265 if children.len() < 2 {
266 return false;
267 }
268
269 let unique_expressions: HashSet<&str> = children
270 .iter()
271 .map(|m| m.license_expression.as_str())
272 .collect();
273 if unique_expressions.len() < 2 {
274 return false;
275 }
276
277 let mut child_union = PositionSet::new();
278 for m in &children {
279 child_union.extend_from_span(m.query_span());
280 }
281
282 let container_only_positions = container_qspan_set.difference(&child_union);
283 let child_only_positions = child_union.difference(&container_qspan_set);
284
285 let mut sorted_children = children;
286 sorted_children.sort_by_key(|m| m.qspan_bounds());
287
288 let mut bridge_positions = BitSet::new();
289 for pair in sorted_children.windows(2) {
290 let (_, previous_end) = pair[0].qspan_bounds();
291 let (next_start, _) = pair[1].qspan_bounds();
292 for pos in previous_end..next_start {
293 bridge_positions.insert(pos);
294 }
295 }
296
297 let container_only_boundary_positions = container_only_positions
298 .iter()
299 .filter(|&pos| !bridge_positions.contains(pos))
300 .count();
301
302 child_only_positions.is_empty()
303 && container_only_positions.len() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
304 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
305}
306
307fn filter_redundant_low_coverage_composite_seq_wrappers(
308 seq_matches: Vec<LicenseMatch>,
309 candidate_contained_matches: &[LicenseMatch],
310) -> Vec<LicenseMatch> {
311 seq_matches
312 .into_iter()
313 .filter(|m| {
314 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
315 })
316 .collect()
317}
318
319fn subtract_spdx_match_qspans(
320 query: &mut Query<'_>,
321 matched_qspans: &mut Vec<models::PositionSpan>,
322 aho_extra_matchables: &mut PositionSet,
323 spdx_matches: &[LicenseMatch],
324) {
325 for m in spdx_matches {
326 let Some(span) = query_span_for_match(m) else {
327 continue;
328 };
329
330 aho_extra_matchables.extend_from_span(&span);
331 query.subtract(&span);
332
333 if has_full_match_coverage(m) {
334 matched_qspans.push(span);
335 }
336 }
337}
338
339fn merge_and_prepare_aho_matches(
340 index: &index::LicenseIndex,
341 query: &mut Query<'_>,
342 matched_qspans: &mut Vec<models::PositionSpan>,
343 refined_aho: &[LicenseMatch],
344) -> (Vec<LicenseMatch>, bool) {
345 let merged_aho = merge_overlapping_matches(refined_aho);
346 let mut saw_long_exact_license_text_match = false;
347
348 for m in &merged_aho {
349 let Some(span) = query_span_for_match(m) else {
350 continue;
351 };
352
353 if has_full_match_coverage(m) {
354 matched_qspans.push(span.clone());
355 }
356
357 if index
358 .rules_by_rid
359 .get(m.rid)
360 .is_some_and(|rule| rule.is_license_text())
361 && m.rule_length > 120
362 && m.coverage() > 98.0
363 {
364 query.subtract(&span);
365 saw_long_exact_license_text_match = true;
366 }
367 }
368
369 (merged_aho, saw_long_exact_license_text_match)
370}
371
372fn collect_whole_query_exact_followup_matches(
373 index: &index::LicenseIndex,
374 query: &mut Query<'_>,
375 matched_qspans: &mut Vec<models::PositionSpan>,
376 whole_run: &query::QueryRun<'_>,
377) -> Vec<LicenseMatch> {
378 let mut seq_all_matches = Vec::new();
379
380 if whole_run.is_matchable(false, matched_qspans) {
381 let near_dupe_candidates =
382 select_seq_candidates(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
383
384 if !near_dupe_candidates.is_empty() {
385 let near_dupe_matches =
386 seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
387
388 for m in &near_dupe_matches {
389 if !m.query_span().is_empty() {
390 let span = m.query_span().clone();
391 query.subtract(&span);
392 matched_qspans.push(span);
393 }
394 }
395
396 seq_all_matches.extend(near_dupe_matches);
397 }
398 }
399
400 seq_all_matches
401}
402
403fn collect_regular_seq_matches(
404 index: &index::LicenseIndex,
405 query: &Query<'_>,
406 matched_qspans: &[models::PositionSpan],
407 candidate_contained_matches: &[LicenseMatch],
408) -> Vec<LicenseMatch> {
409 let mut seq_all_matches = Vec::new();
410
411 for query_run in query.query_runs() {
412 if !query_run.is_matchable(false, matched_qspans) {
413 continue;
414 }
415
416 let candidates =
417 select_seq_candidates(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
418 if !candidates.is_empty() {
419 let matches = seq_match_with_candidates(index, &query_run, &candidates);
420 seq_all_matches.extend(matches);
421 }
422 }
423
424 let merged_seq = merge_overlapping_matches(&seq_all_matches);
425 let filtered_same_expression =
426 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
427 filter_redundant_low_coverage_composite_seq_wrappers(
428 filtered_same_expression,
429 candidate_contained_matches,
430 )
431}
432
433impl LicenseDetectionEngine {
434 fn from_index(index: index::LicenseIndex) -> Result<Self> {
439 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
440 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
441 let spdx_mapping = build_spdx_mapping(&license_vec);
442
443 Ok(Self {
444 index: Arc::new(index),
445 spdx_mapping,
446 })
447 }
448
449 pub fn from_embedded() -> Result<Self> {
458 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
459 let index = load_license_index_from_bytes(artifact_bytes)
460 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
461 Self::from_index(index)
462 }
463
464 pub fn from_directory(rules_path: &Path) -> Result<Self> {
472 let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
473 (rules_path.join("rules"), rules_path.join("licenses"))
474 } else if rules_path.ends_with("rules") {
475 let parent = rules_path.parent().ok_or_else(|| {
476 anyhow::anyhow!("Cannot determine parent directory for rules path")
477 })?;
478 (rules_path.to_path_buf(), parent.join("licenses"))
479 } else {
480 (rules_path.to_path_buf(), rules_path.to_path_buf())
481 };
482
483 let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
484 let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
485 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
486
487 Self::from_index(index)
488 }
489
490 pub fn detect_with_kind(
491 &self,
492 text: &str,
493 unknown_licenses: bool,
494 binary_derived: bool,
495 ) -> Result<Vec<LicenseDetection>> {
496 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, 0.0)
497 }
498
499 pub fn detect_with_kind_with_score(
500 &self,
501 text: &str,
502 unknown_licenses: bool,
503 binary_derived: bool,
504 min_score: f32,
505 ) -> Result<Vec<LicenseDetection>> {
506 let clean_text = strip_utf8_bom_str(text);
507
508 let content = truncate_detection_text(clean_text);
509
510 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
511 let whole_query_run = query.whole_query_run();
512
513 let mut all_matches = Vec::new();
514 let mut candidate_contained_matches = Vec::new();
515 let mut aho_extra_matchables = PositionSet::new();
516 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
517
518 {
521 let hash_matches = hash_match(&self.index, &whole_query_run);
522
523 if !hash_matches.is_empty() {
524 let mut matches = hash_matches;
525 sort_matches_by_line(&mut matches);
526
527 let groups = group_matches_by_region(&matches);
528 let detections: Vec<LicenseDetection> = groups
529 .iter()
530 .map(|group| {
531 let mut detection = empty_detection();
532 populate_detection_from_group_with_spdx(
533 &mut detection,
534 group,
535 &self.spdx_mapping,
536 Some(content),
537 );
538 detection
539 })
540 .collect();
541
542 return Ok(post_process_detections(detections, min_score));
543 }
544 }
545
546 {
548 let spdx_matches = spdx_lid_match(&self.index, &query);
549 subtract_spdx_match_qspans(
550 &mut query,
551 &mut matched_qspans,
552 &mut aho_extra_matchables,
553 &spdx_matches,
554 );
555 all_matches.extend(spdx_matches);
556 }
557
558 {
560 let aho_matches = if aho_extra_matchables.is_empty() {
561 aho_match(&self.index, &whole_query_run)
562 } else {
563 aho_match::aho_match_with_extra_matchables(
564 &self.index,
565 &whole_query_run,
566 Some(&aho_extra_matchables),
567 )
568 };
569
570 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
573 candidate_contained_matches.extend(refined_aho.clone());
574 let (merged_aho, _) = merge_and_prepare_aho_matches(
575 &self.index,
576 &mut query,
577 &mut matched_qspans,
578 &refined_aho,
579 );
580 all_matches.extend(merged_aho);
581
582 let whole_query_followup = collect_whole_query_exact_followup_matches(
583 &self.index,
584 &mut query,
585 &mut matched_qspans,
586 &whole_query_run,
587 );
588 all_matches.extend(whole_query_followup);
589
590 let merged_seq = collect_regular_seq_matches(
591 &self.index,
592 &query,
593 &matched_qspans,
594 &candidate_contained_matches,
595 );
596 all_matches.extend(merged_seq);
597 }
598
599 let merged_matches =
602 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
603
604 let refined_matches = if unknown_licenses {
607 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
609
610 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
612 let filtered_unknown =
613 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
614
615 let mut all_matches = good_matches;
616 all_matches.extend(filtered_unknown);
617 all_matches.extend(weak_matches);
620 all_matches
621 } else {
622 merged_matches
623 };
624
625 let refined = refine_matches(&self.index, refined_matches, &query);
627
628 let mut sorted = refined;
629 sort_matches_by_line(&mut sorted);
630
631 let groups = group_matches_by_region(&sorted);
632
633 let detections: Vec<LicenseDetection> = groups
634 .iter()
635 .map(|group| {
636 let mut detection = empty_detection();
637 populate_detection_from_group_with_spdx(
638 &mut detection,
639 group,
640 &self.spdx_mapping,
641 Some(content),
642 );
643 detection
644 })
645 .collect();
646
647 let detections = post_process_detections(detections, min_score);
648
649 Ok(detections)
650 }
651
652 pub fn detect_with_kind_and_source(
653 &self,
654 text: &str,
655 unknown_licenses: bool,
656 binary_derived: bool,
657 source_path: &str,
658 ) -> Result<Vec<LicenseDetection>> {
659 let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
660 attach_source_path_to_detections(&mut detections, source_path);
661 Ok(detections)
662 }
663
664 pub fn detect_with_kind_and_source_with_score(
665 &self,
666 text: &str,
667 unknown_licenses: bool,
668 binary_derived: bool,
669 source_path: &str,
670 min_score: f32,
671 ) -> Result<Vec<LicenseDetection>> {
672 let mut detections =
673 self.detect_with_kind_with_score(text, unknown_licenses, binary_derived, min_score)?;
674 attach_source_path_to_detections(&mut detections, source_path);
675 Ok(detections)
676 }
677
678 #[cfg(any(test, feature = "golden-tests"))]
683 pub fn detect_matches_with_kind(
684 &self,
685 text: &str,
686 unknown_licenses: bool,
687 binary_derived: bool,
688 ) -> Result<Vec<LicenseMatch>> {
689 let clean_text = strip_utf8_bom_str(text);
690
691 let content = truncate_detection_text(clean_text);
692
693 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
694 let whole_query_run = query.whole_query_run();
695
696 let mut all_matches = Vec::new();
697 let mut candidate_contained_matches = Vec::new();
698 let mut aho_extra_matchables = PositionSet::new();
699 let mut matched_qspans: Vec<models::PositionSpan> = Vec::new();
700
701 {
703 let hash_matches = hash_match(&self.index, &whole_query_run);
704
705 if !hash_matches.is_empty() {
706 let mut matches = hash_matches;
707 sort_matches_by_line(&mut matches);
708 return Ok(matches);
709 }
710 }
711
712 {
714 let spdx_matches = spdx_lid_match(&self.index, &query);
715 subtract_spdx_match_qspans(
716 &mut query,
717 &mut matched_qspans,
718 &mut aho_extra_matchables,
719 &spdx_matches,
720 );
721 all_matches.extend(spdx_matches);
722 }
723
724 {
726 let aho_matches = if aho_extra_matchables.is_empty() {
727 aho_match(&self.index, &whole_query_run)
728 } else {
729 aho_match::aho_match_with_extra_matchables(
730 &self.index,
731 &whole_query_run,
732 Some(&aho_extra_matchables),
733 )
734 };
735 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
736 candidate_contained_matches.extend(refined_aho.clone());
737 let (merged_aho, _) = merge_and_prepare_aho_matches(
738 &self.index,
739 &mut query,
740 &mut matched_qspans,
741 &refined_aho,
742 );
743 all_matches.extend(merged_aho);
744
745 let whole_query_followup = collect_whole_query_exact_followup_matches(
746 &self.index,
747 &mut query,
748 &mut matched_qspans,
749 &whole_query_run,
750 );
751 all_matches.extend(whole_query_followup);
752
753 let merged_seq = collect_regular_seq_matches(
754 &self.index,
755 &query,
756 &matched_qspans,
757 &candidate_contained_matches,
758 );
759 all_matches.extend(merged_seq);
760 }
761
762 let merged_matches =
764 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
765
766 let refined_matches = if unknown_licenses {
768 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
769 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
770 let filtered_unknown =
771 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
772
773 let mut all_matches = good_matches;
774 all_matches.extend(filtered_unknown);
775 all_matches.extend(weak_matches);
776 all_matches
777 } else {
778 merged_matches
779 };
780
781 let refined = refine_matches(&self.index, refined_matches, &query);
783
784 let mut sorted = refined;
785 sort_matches_by_line(&mut sorted);
786
787 Ok(sorted)
789 }
790
791 pub fn index(&self) -> &index::LicenseIndex {
793 &self.index
794 }
795
796 #[cfg(test)]
798 pub fn spdx_mapping(&self) -> &SpdxMapping {
799 &self.spdx_mapping
800 }
801}
802
803#[cfg(test)]
804mod tests;