1pub mod aho_match;
4pub mod automaton;
5pub(crate) mod detection;
6pub mod embedded;
7
8#[cfg(test)]
9mod embedded_test;
10pub mod expression;
11#[cfg(test)]
12mod golden_test;
13pub mod hash_match;
14pub mod index;
15mod match_refine;
16pub mod models;
17pub mod query;
18pub mod rules;
19pub mod seq_match;
20pub mod spans;
21pub mod spdx_lid;
22pub mod spdx_mapping;
23#[cfg(test)]
24mod test_utils;
25pub mod tokenize;
26pub mod unknown_match;
27
28use bit_set::BitSet;
29use std::collections::HashSet;
30use std::path::Path;
31use std::sync::Arc;
32
33use anyhow::Result;
34
35use crate::license_detection::embedded::index::load_license_index_from_bytes;
36use crate::license_detection::index::build_index_from_loaded;
37use crate::license_detection::query::Query;
38use crate::license_detection::rules::{
39 load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
40};
41use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
42use crate::utils::text::strip_utf8_bom_str;
43
44use crate::license_detection::detection::{
45 attach_source_path_to_detections, empty_detection, populate_detection_from_group_with_spdx,
46};
47use crate::license_detection::models::MatcherKind;
48
/// Path of the `rules` directory inside the ScanCode toolkit reference checkout.
// NOTE(review): the path is relative; presumably resolved against the repository root — confirm.
// `dead_code` is allowed because nothing visible in this module references the constant.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Path of the `licenses` directory inside the ScanCode toolkit reference checkout
/// (a sibling of the `rules` directory).
// `dead_code` is allowed because nothing visible in this module references the constant.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Path of the `data` directory that contains both the `rules` and `licenses` directories.
// `dead_code` is allowed because nothing visible in this module references the constant.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
65
66pub(crate) use detection::{
67 LicenseDetection, group_matches_by_region, post_process_detections, sort_matches_by_line,
68};
69pub use models::LicenseMatch;
70
71pub use aho_match::aho_match;
72pub use hash_match::hash_match;
73pub use match_refine::{
74 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
75 refine_matches_without_false_positive_filter, split_weak_matches,
76};
77pub use seq_match::{
78 MAX_NEAR_DUPE_CANDIDATES, compute_candidates_with_msets, seq_match_with_candidates,
79};
80pub use spdx_lid::spdx_lid_match;
81pub use unknown_match::unknown_match;
82
/// License detection engine: a shared [`index::LicenseIndex`] together with the
/// [`SpdxMapping`] that is used when detections are populated.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// License index handed to every matching stage; reference-counted, so clones of the
    /// engine share the same index.
    index: Arc<index::LicenseIndex>,
    /// SPDX mapping passed along when a detection is populated from a group of matches.
    spdx_mapping: SpdxMapping,
}
93
/// Maximum number of bytes of text that detection will look at; longer input is truncated.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Value passed as the last argument of `compute_candidates_with_msets` for regular
/// (per query run) sequence matching.
// NOTE(review): presumably the maximum number of candidate rules kept — confirm.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Base allowance, in query positions, for positions covered by a sequence container but
/// not by its exact children; the redundancy checks scale or compare against this value.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance, in query positions, for positions covered by the exact children but
/// not by the sequence container.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
98
99fn truncate_detection_text(clean_text: &str) -> &str {
100 if clean_text.len() <= MAX_DETECTION_SIZE {
101 return clean_text;
102 }
103
104 log::warn!(
105 "Content size {} exceeds limit {}, truncating for detection",
106 clean_text.len(),
107 MAX_DETECTION_SIZE
108 );
109
110 let boundary = clean_text.floor_char_boundary(MAX_DETECTION_SIZE);
111 &clean_text[..boundary]
112}
113
114fn query_span_for_match(m: &LicenseMatch) -> Option<query::PositionSpan> {
115 (m.end_token > m.start_token).then(|| query::PositionSpan::new(m.start_token, m.end_token - 1))
116}
117
118fn has_full_match_coverage(m: &LicenseMatch) -> bool {
119 ((m.match_coverage * 100.0).round() / 100.0) == 100.0
120}
121
122fn is_redundant_same_expression_seq_container(
123 container: &LicenseMatch,
124 candidate_contained_matches: &[LicenseMatch],
125) -> bool {
126 let container_is_redundant_coverage =
127 has_full_match_coverage(container) || container.match_coverage >= 99.0;
128 if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
129 return false;
130 }
131
132 let container_qspan_set: BitSet = container.qspan_bitset();
133
134 let mut contained: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
135 .iter()
136 .filter_map(|m| {
137 if m.matcher == MatcherKind::Aho
138 && has_full_match_coverage(m)
139 && m.license_expression == container.license_expression
140 && (container.qcontains_with_set(m, &container_qspan_set)
141 || container.qoverlap_with_set(m, &container_qspan_set) > 0)
142 {
143 Some((m, m.qspan()))
144 } else {
145 None
146 }
147 })
148 .collect();
149
150 if contained.len() < 2 {
151 return false;
152 }
153
154 let material_children = contained
155 .iter()
156 .filter(|(m, _)| m.matched_length > 1)
157 .count();
158 if material_children < 2 {
159 return false;
160 }
161
162 contained.sort_by_key(|(m, _)| m.qspan_bounds());
163
164 let mut child_union = BitSet::new();
165 for (_, qspan) in &contained {
166 for &pos in qspan {
167 child_union.insert(pos);
168 }
169 }
170
171 let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
172 let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
173
174 let mut bridge_positions = BitSet::new();
175 for pair in contained.windows(2) {
176 let (_, previous_end) = pair[0].0.qspan_bounds();
177 let (next_start, _) = pair[1].0.qspan_bounds();
178
179 if next_start < previous_end {
180 return false;
181 }
182
183 for pos in previous_end..next_start {
184 bridge_positions.insert(pos);
185 }
186 }
187
188 let container_only_boundary_positions = container_only_positions
189 .difference(&bridge_positions)
190 .count();
191
192 if container_only_positions.count() == 1
193 && container_only_boundary_positions == 0
194 && child_only_positions.is_empty()
195 {
196 return false;
197 }
198
199 if child_only_positions.is_empty()
200 && container_only_positions.count() == container_only_boundary_positions
201 && container_only_boundary_positions <= 3
202 {
203 let earliest_child = contained
204 .iter()
205 .map(|(m, _)| m.qspan_bounds().0)
206 .min()
207 .unwrap_or(usize::MAX);
208 let latest_child = contained
209 .iter()
210 .map(|(m, _)| m.qspan_bounds().1.saturating_sub(1))
211 .max()
212 .unwrap_or(0);
213
214 let is_one_sided_boundary = container_only_positions
215 .iter()
216 .all(|pos| pos < earliest_child)
217 || container_only_positions
218 .iter()
219 .all(|pos| pos > latest_child);
220
221 if is_one_sided_boundary {
222 return false;
223 }
224 }
225
226 let max_container_only_positions =
227 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
228 let max_container_boundary_positions =
229 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
230 let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
231
232 container_only_positions.count() <= max_container_only_positions
233 && container_only_boundary_positions <= max_container_boundary_positions
234 && child_only_positions.count() <= max_child_only_positions
235}
236
237fn filter_redundant_same_expression_seq_containers(
238 seq_matches: Vec<LicenseMatch>,
239 candidate_contained_matches: &[LicenseMatch],
240) -> Vec<LicenseMatch> {
241 seq_matches
242 .into_iter()
243 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
244 .collect()
245}
246
247fn is_redundant_low_coverage_composite_seq_wrapper(
248 container: &LicenseMatch,
249 candidate_contained_matches: &[LicenseMatch],
250) -> bool {
251 if container.matcher != seq_match::MATCH_SEQ || container.match_coverage >= 30.0 {
252 return false;
253 }
254
255 let container_qspan_set: BitSet = container.qspan_bitset();
256
257 let children: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
258 .iter()
259 .filter_map(|m| {
260 if m.matcher == aho_match::MATCH_AHO
261 && has_full_match_coverage(m)
262 && m.license_expression != container.license_expression
263 && (container.qcontains_with_set(m, &container_qspan_set)
264 || container.qoverlap_with_set(m, &container_qspan_set) > 0)
265 {
266 Some((m, m.qspan()))
267 } else {
268 None
269 }
270 })
271 .collect();
272
273 if children.len() < 2 {
274 return false;
275 }
276
277 let unique_expressions: HashSet<&str> = children
278 .iter()
279 .map(|(m, _)| m.license_expression.as_str())
280 .collect();
281 if unique_expressions.len() < 2 {
282 return false;
283 }
284
285 let mut child_union = BitSet::new();
286 for (_, qspan) in &children {
287 for &pos in qspan {
288 child_union.insert(pos);
289 }
290 }
291
292 let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
293 let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
294
295 let mut sorted_children = children;
296 sorted_children.sort_by_key(|(m, _)| m.qspan_bounds());
297
298 let mut bridge_positions = BitSet::new();
299 for pair in sorted_children.windows(2) {
300 let (_, previous_end) = pair[0].0.qspan_bounds();
301 let (next_start, _) = pair[1].0.qspan_bounds();
302 for pos in previous_end..next_start {
303 bridge_positions.insert(pos);
304 }
305 }
306
307 let container_only_boundary_positions = container_only_positions
308 .difference(&bridge_positions)
309 .count();
310
311 child_only_positions.is_empty()
312 && container_only_positions.count() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
313 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
314}
315
316fn filter_redundant_low_coverage_composite_seq_wrappers(
317 seq_matches: Vec<LicenseMatch>,
318 candidate_contained_matches: &[LicenseMatch],
319) -> Vec<LicenseMatch> {
320 seq_matches
321 .into_iter()
322 .filter(|m| {
323 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
324 })
325 .collect()
326}
327
328fn subtract_spdx_match_qspans(
329 query: &mut Query<'_>,
330 matched_qspans: &mut Vec<query::PositionSpan>,
331 aho_extra_matchables: &mut BitSet,
332 spdx_matches: &[LicenseMatch],
333) {
334 for m in spdx_matches {
335 let Some(span) = query_span_for_match(m) else {
336 continue;
337 };
338
339 for pos in span.iter() {
340 aho_extra_matchables.insert(pos);
341 }
342 query.subtract(&span);
343
344 if (m.match_coverage * 100.0).round() / 100.0 == 100.0 {
345 matched_qspans.push(span);
346 }
347 }
348}
349
350fn merge_and_prepare_aho_matches(
351 index: &index::LicenseIndex,
352 query: &mut Query<'_>,
353 matched_qspans: &mut Vec<query::PositionSpan>,
354 refined_aho: &[LicenseMatch],
355) -> (Vec<LicenseMatch>, bool) {
356 let merged_aho = merge_overlapping_matches(refined_aho);
357 let mut saw_long_exact_license_text_match = false;
358
359 for m in &merged_aho {
360 let Some(span) = query_span_for_match(m) else {
361 continue;
362 };
363
364 if has_full_match_coverage(m) {
365 matched_qspans.push(span.clone());
366 }
367
368 if index
369 .rules_by_rid
370 .get(m.rid)
371 .is_some_and(|rule| rule.is_license_text())
372 && m.rule_length > 120
373 && m.match_coverage > 98.0
374 {
375 query.subtract(&span);
376 saw_long_exact_license_text_match = true;
377 }
378 }
379
380 (merged_aho, saw_long_exact_license_text_match)
381}
382
383fn collect_whole_query_exact_followup_matches(
384 index: &index::LicenseIndex,
385 query: &mut Query<'_>,
386 matched_qspans: &mut Vec<query::PositionSpan>,
387 whole_run: &query::QueryRun<'_>,
388) -> Vec<LicenseMatch> {
389 let mut seq_all_matches = Vec::new();
390
391 if whole_run.is_matchable(false, matched_qspans) {
392 let near_dupe_candidates =
393 compute_candidates_with_msets(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
394
395 if !near_dupe_candidates.is_empty() {
396 let near_dupe_matches =
397 seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
398
399 for m in &near_dupe_matches {
400 if m.end_token > m.start_token {
401 let span = query::PositionSpan::new(m.start_token, m.end_token - 1);
402 query.subtract(&span);
403 matched_qspans.push(span);
404 }
405 }
406
407 seq_all_matches.extend(near_dupe_matches);
408 }
409 }
410
411 seq_all_matches
412}
413
414fn collect_regular_seq_matches(
415 index: &index::LicenseIndex,
416 query: &Query<'_>,
417 matched_qspans: &[query::PositionSpan],
418 candidate_contained_matches: &[LicenseMatch],
419) -> Vec<LicenseMatch> {
420 let mut seq_all_matches = Vec::new();
421
422 for query_run in query.query_runs() {
423 if !query_run.is_matchable(false, matched_qspans) {
424 continue;
425 }
426
427 let candidates =
428 compute_candidates_with_msets(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
429 if !candidates.is_empty() {
430 let matches = seq_match_with_candidates(index, &query_run, &candidates);
431 seq_all_matches.extend(matches);
432 }
433 }
434
435 let merged_seq = merge_overlapping_matches(&seq_all_matches);
436 let filtered_same_expression =
437 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
438 filter_redundant_low_coverage_composite_seq_wrappers(
439 filtered_same_expression,
440 candidate_contained_matches,
441 )
442}
443
444impl LicenseDetectionEngine {
445 fn from_index(index: index::LicenseIndex) -> Result<Self> {
450 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
451 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
452 let spdx_mapping = build_spdx_mapping(&license_vec);
453
454 Ok(Self {
455 index: Arc::new(index),
456 spdx_mapping,
457 })
458 }
459
460 pub fn from_embedded() -> Result<Self> {
469 let artifact_bytes = include_bytes!("../../resources/license_detection/license_index.zst");
470 let index = load_license_index_from_bytes(artifact_bytes)
471 .map_err(|e| anyhow::anyhow!("Failed to load embedded license index: {}", e))?;
472 Self::from_index(index)
473 }
474
475 pub fn from_directory(rules_path: &Path) -> Result<Self> {
483 let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
484 (rules_path.join("rules"), rules_path.join("licenses"))
485 } else if rules_path.ends_with("rules") {
486 let parent = rules_path.parent().ok_or_else(|| {
487 anyhow::anyhow!("Cannot determine parent directory for rules path")
488 })?;
489 (rules_path.to_path_buf(), parent.join("licenses"))
490 } else {
491 (rules_path.to_path_buf(), rules_path.to_path_buf())
492 };
493
494 let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
495 let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
496 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
497
498 Self::from_index(index)
499 }
500
501 pub fn detect_with_kind(
502 &self,
503 text: &str,
504 unknown_licenses: bool,
505 binary_derived: bool,
506 ) -> Result<Vec<LicenseDetection>> {
507 let clean_text = strip_utf8_bom_str(text);
508
509 let content = truncate_detection_text(clean_text);
510
511 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
512 let whole_query_run = query.whole_query_run();
513
514 let mut all_matches = Vec::new();
515 let mut candidate_contained_matches = Vec::new();
516 let mut aho_extra_matchables = BitSet::new();
517 let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
518
519 {
522 let hash_matches = hash_match(&self.index, &whole_query_run);
523
524 if !hash_matches.is_empty() {
525 let mut matches = hash_matches;
526 sort_matches_by_line(&mut matches);
527
528 let groups = group_matches_by_region(&matches);
529 let detections: Vec<LicenseDetection> = groups
530 .iter()
531 .map(|group| {
532 let mut detection = empty_detection();
533 populate_detection_from_group_with_spdx(
534 &mut detection,
535 group,
536 &self.spdx_mapping,
537 );
538 detection
539 })
540 .collect();
541
542 return Ok(post_process_detections(detections, 0.0));
543 }
544 }
545
546 {
548 let spdx_matches = spdx_lid_match(&self.index, &query);
549 let merged_spdx = merge_overlapping_matches(&spdx_matches);
550 subtract_spdx_match_qspans(
551 &mut query,
552 &mut matched_qspans,
553 &mut aho_extra_matchables,
554 &merged_spdx,
555 );
556 all_matches.extend(merged_spdx);
557 }
558
559 {
561 let aho_matches = if aho_extra_matchables.is_empty() {
562 aho_match(&self.index, &whole_query_run)
563 } else {
564 aho_match::aho_match_with_extra_matchables(
565 &self.index,
566 &whole_query_run,
567 Some(&aho_extra_matchables),
568 )
569 };
570
571 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
574 candidate_contained_matches.extend(refined_aho.clone());
575 let (merged_aho, _) = merge_and_prepare_aho_matches(
576 &self.index,
577 &mut query,
578 &mut matched_qspans,
579 &refined_aho,
580 );
581 all_matches.extend(merged_aho);
582
583 let whole_query_followup = collect_whole_query_exact_followup_matches(
584 &self.index,
585 &mut query,
586 &mut matched_qspans,
587 &whole_query_run,
588 );
589 all_matches.extend(whole_query_followup);
590
591 let merged_seq = collect_regular_seq_matches(
592 &self.index,
593 &query,
594 &matched_qspans,
595 &candidate_contained_matches,
596 );
597 all_matches.extend(merged_seq);
598 }
599
600 let merged_matches =
603 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
604
605 let refined_matches = if unknown_licenses {
608 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
610
611 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
613 let filtered_unknown =
614 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
615
616 let mut all_matches = good_matches;
617 all_matches.extend(filtered_unknown);
618 all_matches.extend(weak_matches);
621 all_matches
622 } else {
623 merged_matches
624 };
625
626 let refined = refine_matches(&self.index, refined_matches, &query);
628
629 let mut sorted = refined;
630 sort_matches_by_line(&mut sorted);
631
632 let groups = group_matches_by_region(&sorted);
633
634 let detections: Vec<LicenseDetection> = groups
635 .iter()
636 .map(|group| {
637 let mut detection = empty_detection();
638 populate_detection_from_group_with_spdx(&mut detection, group, &self.spdx_mapping);
639 detection
640 })
641 .collect();
642
643 let detections = post_process_detections(detections, 0.0);
644
645 Ok(detections)
646 }
647
648 pub fn detect_with_kind_and_source(
649 &self,
650 text: &str,
651 unknown_licenses: bool,
652 binary_derived: bool,
653 source_path: &str,
654 ) -> Result<Vec<LicenseDetection>> {
655 let mut detections = self.detect_with_kind(text, unknown_licenses, binary_derived)?;
656 attach_source_path_to_detections(&mut detections, source_path);
657 Ok(detections)
658 }
659
660 #[cfg(test)]
664 pub fn detect_matches_with_kind(
665 &self,
666 text: &str,
667 unknown_licenses: bool,
668 binary_derived: bool,
669 ) -> Result<Vec<LicenseMatch>> {
670 let clean_text = strip_utf8_bom_str(text);
671
672 let content = truncate_detection_text(clean_text);
673
674 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
675 let whole_query_run = query.whole_query_run();
676
677 let mut all_matches = Vec::new();
678 let mut candidate_contained_matches = Vec::new();
679 let mut aho_extra_matchables = BitSet::new();
680 let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
681
682 {
684 let hash_matches = hash_match(&self.index, &whole_query_run);
685
686 if !hash_matches.is_empty() {
687 let mut matches = hash_matches;
688 sort_matches_by_line(&mut matches);
689 return Ok(matches);
690 }
691 }
692
693 {
695 let spdx_matches = spdx_lid_match(&self.index, &query);
696 let merged_spdx = merge_overlapping_matches(&spdx_matches);
697 subtract_spdx_match_qspans(
698 &mut query,
699 &mut matched_qspans,
700 &mut aho_extra_matchables,
701 &merged_spdx,
702 );
703 all_matches.extend(merged_spdx);
704 }
705
706 {
708 let aho_matches = if aho_extra_matchables.is_empty() {
709 aho_match(&self.index, &whole_query_run)
710 } else {
711 aho_match::aho_match_with_extra_matchables(
712 &self.index,
713 &whole_query_run,
714 Some(&aho_extra_matchables),
715 )
716 };
717 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
718 candidate_contained_matches.extend(refined_aho.clone());
719 let (merged_aho, _) = merge_and_prepare_aho_matches(
720 &self.index,
721 &mut query,
722 &mut matched_qspans,
723 &refined_aho,
724 );
725 all_matches.extend(merged_aho);
726
727 let whole_query_followup = collect_whole_query_exact_followup_matches(
728 &self.index,
729 &mut query,
730 &mut matched_qspans,
731 &whole_query_run,
732 );
733 all_matches.extend(whole_query_followup);
734
735 let merged_seq = collect_regular_seq_matches(
736 &self.index,
737 &query,
738 &matched_qspans,
739 &candidate_contained_matches,
740 );
741 all_matches.extend(merged_seq);
742 }
743
744 let merged_matches =
746 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
747
748 let refined_matches = if unknown_licenses {
750 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
751 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
752 let filtered_unknown =
753 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
754
755 let mut all_matches = good_matches;
756 all_matches.extend(filtered_unknown);
757 all_matches.extend(weak_matches);
758 all_matches
759 } else {
760 merged_matches
761 };
762
763 let refined = refine_matches(&self.index, refined_matches, &query);
765
766 let mut sorted = refined;
767 sort_matches_by_line(&mut sorted);
768
769 Ok(sorted)
771 }
772
773 pub fn index(&self) -> &index::LicenseIndex {
775 &self.index
776 }
777
778 #[cfg(test)]
780 pub fn spdx_mapping(&self) -> &SpdxMapping {
781 &self.spdx_mapping
782 }
783}
784
785#[cfg(test)]
786mod tests;