1pub mod aho_match;
4mod detection;
5pub mod embedded;
6
7#[cfg(test)]
8mod embedded_test;
9pub mod expression;
10#[cfg(test)]
11mod golden_test;
12pub mod hash_match;
13pub mod index;
14mod match_refine;
15pub mod models;
16pub mod query;
17pub mod rules;
18pub mod seq_match;
19pub mod spans;
20pub mod spdx_lid;
21pub mod spdx_mapping;
22#[cfg(test)]
23mod test_utils;
24pub mod tokenize;
25pub mod unknown_match;
26
27use bit_set::BitSet;
28use std::collections::HashSet;
29use std::path::Path;
30use std::sync::Arc;
31
32use anyhow::Result;
33
34use crate::license_detection::index::build_index_from_loaded;
35use crate::license_detection::query::Query;
36use crate::license_detection::rules::{
37 load_loaded_licenses_from_directory, load_loaded_rules_from_directory,
38};
39use crate::license_detection::spdx_mapping::{SpdxMapping, build_spdx_mapping};
40use crate::utils::text::strip_utf8_bom_str;
41
42use crate::license_detection::detection::populate_detection_from_group_with_spdx;
43use crate::license_detection::models::MatcherKind;
44
/// Path string pointing at the `rules` directory of the ScanCode license data.
///
/// The value has no leading separator, so it is a relative path; what it is
/// resolved against is decided by the caller.
// NOTE(review): `dead_code` is presumably allowed because not every build
// target references this constant — confirm before removing the attribute.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_RULES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/rules";

/// Path string pointing at the `licenses` directory of the ScanCode license data.
///
/// Sibling of [`SCANCODE_LICENSES_RULES_PATH`] under the same `data` directory.
// NOTE(review): see the note on `SCANCODE_LICENSES_RULES_PATH` about `dead_code`.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_LICENSES_PATH: &str =
    "reference/scancode-toolkit/src/licensedcode/data/licenses";

/// Path string pointing at the ScanCode `data` directory that contains both the
/// `rules` and `licenses` sub-directories named by the two constants above.
// NOTE(review): see the note on `SCANCODE_LICENSES_RULES_PATH` about `dead_code`.
#[allow(dead_code)]
pub const SCANCODE_LICENSES_DATA_PATH: &str = "reference/scancode-toolkit/src/licensedcode/data";
61
62pub use detection::{
63 LicenseDetection, create_detection_from_group, group_matches_by_region,
64 post_process_detections, sort_matches_by_line,
65};
66pub use models::LicenseMatch;
67
68pub use aho_match::aho_match;
69pub use hash_match::hash_match;
70pub use match_refine::{
71 filter_invalid_contained_unknown_matches, merge_overlapping_matches, refine_matches,
72 refine_matches_without_false_positive_filter, split_weak_matches,
73};
74pub use seq_match::{
75 MAX_NEAR_DUPE_CANDIDATES, compute_candidates_with_msets, seq_match_with_candidates,
76};
77pub use spdx_lid::spdx_lid_match;
78pub use unknown_match::unknown_match;
79
/// Entry point for license detection: owns the license index and the SPDX
/// mapping that the detection methods in this module consult.
///
/// Cloning is cheap for the index because it is held behind an [`Arc`]; the
/// SPDX mapping is cloned by value.
#[derive(Debug, Clone)]
pub struct LicenseDetectionEngine {
    /// Shared license index passed to every matcher invoked by the engine.
    index: Arc<index::LicenseIndex>,
    /// Mapping built from the index's licenses and used when populating detections.
    spdx_mapping: SpdxMapping,
}
90
/// Upper bound, in bytes, on the text handed to the detection pipeline
/// (10 MiB); longer input is truncated before a query is built.
const MAX_DETECTION_SIZE: usize = 10 * 1024 * 1024;
/// Candidate limit passed to `compute_candidates_with_msets` for the regular
/// (per query run) sequence matching pass.
const MAX_REGULAR_SEQ_CANDIDATES: usize = 70;
/// Base allowance used when bounding how many container-only positions a
/// sequence match may have and still be treated as a redundant wrapper.
const MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP: usize = 8;
/// Base allowance for positions covered by the exact children but not by the
/// wrapping sequence match.
const MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP: usize = 2;
95
96fn query_span_for_match(m: &LicenseMatch) -> Option<query::PositionSpan> {
97 (m.end_token > m.start_token).then(|| query::PositionSpan::new(m.start_token, m.end_token - 1))
98}
99
100fn has_full_match_coverage(m: &LicenseMatch) -> bool {
101 ((m.match_coverage * 100.0).round() / 100.0) == 100.0
102}
103
104fn is_redundant_same_expression_seq_container(
105 container: &LicenseMatch,
106 candidate_contained_matches: &[LicenseMatch],
107) -> bool {
108 let container_is_redundant_coverage =
109 has_full_match_coverage(container) || container.match_coverage >= 99.0;
110 if container.matcher != MatcherKind::Seq || !container_is_redundant_coverage {
111 return false;
112 }
113
114 let container_qspan_set: BitSet = container.qspan_bitset();
115
116 let mut contained: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
117 .iter()
118 .filter_map(|m| {
119 if m.matcher == MatcherKind::Aho
120 && has_full_match_coverage(m)
121 && m.license_expression == container.license_expression
122 && (container.qcontains_with_set(m, &container_qspan_set)
123 || container.qoverlap_with_set(m, &container_qspan_set) > 0)
124 {
125 Some((m, m.qspan()))
126 } else {
127 None
128 }
129 })
130 .collect();
131
132 if contained.len() < 2 {
133 return false;
134 }
135
136 let material_children = contained
137 .iter()
138 .filter(|(m, _)| m.matched_length > 1)
139 .count();
140 if material_children < 2 {
141 return false;
142 }
143
144 contained.sort_by_key(|(m, _)| m.qspan_bounds());
145
146 let mut child_union = BitSet::new();
147 for (_, qspan) in &contained {
148 for &pos in qspan {
149 child_union.insert(pos);
150 }
151 }
152
153 let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
154 let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
155
156 let mut bridge_positions = BitSet::new();
157 for pair in contained.windows(2) {
158 let (_, previous_end) = pair[0].0.qspan_bounds();
159 let (next_start, _) = pair[1].0.qspan_bounds();
160
161 if next_start < previous_end {
162 return false;
163 }
164
165 for pos in previous_end..next_start {
166 bridge_positions.insert(pos);
167 }
168 }
169
170 let container_only_boundary_positions = container_only_positions
171 .difference(&bridge_positions)
172 .count();
173
174 if container_only_positions.count() == 1
175 && container_only_boundary_positions == 0
176 && child_only_positions.is_empty()
177 {
178 return false;
179 }
180
181 if child_only_positions.is_empty()
182 && container_only_positions.count() == container_only_boundary_positions
183 && container_only_boundary_positions <= 3
184 {
185 let earliest_child = contained
186 .iter()
187 .map(|(m, _)| m.qspan_bounds().0)
188 .min()
189 .unwrap_or(usize::MAX);
190 let latest_child = contained
191 .iter()
192 .map(|(m, _)| m.qspan_bounds().1.saturating_sub(1))
193 .max()
194 .unwrap_or(0);
195
196 let is_one_sided_boundary = container_only_positions
197 .iter()
198 .all(|pos| pos < earliest_child)
199 || container_only_positions
200 .iter()
201 .all(|pos| pos > latest_child);
202
203 if is_one_sided_boundary {
204 return false;
205 }
206 }
207
208 let max_container_only_positions =
209 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * contained.len() + 1;
210 let max_container_boundary_positions =
211 MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP * (contained.len() - 1);
212 let max_child_only_positions = MAX_REDUNDANT_SEQ_CONTAINER_UNMATCHED_GAP + 1;
213
214 container_only_positions.count() <= max_container_only_positions
215 && container_only_boundary_positions <= max_container_boundary_positions
216 && child_only_positions.count() <= max_child_only_positions
217}
218
219fn filter_redundant_same_expression_seq_containers(
220 seq_matches: Vec<LicenseMatch>,
221 candidate_contained_matches: &[LicenseMatch],
222) -> Vec<LicenseMatch> {
223 seq_matches
224 .into_iter()
225 .filter(|m| !is_redundant_same_expression_seq_container(m, candidate_contained_matches))
226 .collect()
227}
228
229fn is_redundant_low_coverage_composite_seq_wrapper(
230 container: &LicenseMatch,
231 candidate_contained_matches: &[LicenseMatch],
232) -> bool {
233 if container.matcher != seq_match::MATCH_SEQ || container.match_coverage >= 30.0 {
234 return false;
235 }
236
237 let container_qspan_set: BitSet = container.qspan_bitset();
238
239 let children: Vec<(&LicenseMatch, Vec<usize>)> = candidate_contained_matches
240 .iter()
241 .filter_map(|m| {
242 if m.matcher == aho_match::MATCH_AHO
243 && has_full_match_coverage(m)
244 && m.license_expression != container.license_expression
245 && (container.qcontains_with_set(m, &container_qspan_set)
246 || container.qoverlap_with_set(m, &container_qspan_set) > 0)
247 {
248 Some((m, m.qspan()))
249 } else {
250 None
251 }
252 })
253 .collect();
254
255 if children.len() < 2 {
256 return false;
257 }
258
259 let unique_expressions: HashSet<&str> = children
260 .iter()
261 .map(|(m, _)| m.license_expression.as_str())
262 .collect();
263 if unique_expressions.len() < 2 {
264 return false;
265 }
266
267 let mut child_union = BitSet::new();
268 for (_, qspan) in &children {
269 for &pos in qspan {
270 child_union.insert(pos);
271 }
272 }
273
274 let container_only_positions: BitSet = container_qspan_set.difference(&child_union).collect();
275 let child_only_positions: BitSet = child_union.difference(&container_qspan_set).collect();
276
277 let mut sorted_children = children;
278 sorted_children.sort_by_key(|(m, _)| m.qspan_bounds());
279
280 let mut bridge_positions = BitSet::new();
281 for pair in sorted_children.windows(2) {
282 let (_, previous_end) = pair[0].0.qspan_bounds();
283 let (next_start, _) = pair[1].0.qspan_bounds();
284 for pos in previous_end..next_start {
285 bridge_positions.insert(pos);
286 }
287 }
288
289 let container_only_boundary_positions = container_only_positions
290 .difference(&bridge_positions)
291 .count();
292
293 child_only_positions.is_empty()
294 && container_only_positions.count() <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
295 && container_only_boundary_positions <= MAX_REDUNDANT_SEQ_CONTAINER_BOUNDARY_GAP
296}
297
298fn filter_redundant_low_coverage_composite_seq_wrappers(
299 seq_matches: Vec<LicenseMatch>,
300 candidate_contained_matches: &[LicenseMatch],
301) -> Vec<LicenseMatch> {
302 seq_matches
303 .into_iter()
304 .filter(|m| {
305 !is_redundant_low_coverage_composite_seq_wrapper(m, candidate_contained_matches)
306 })
307 .collect()
308}
309
310fn subtract_spdx_match_qspans(
311 query: &mut Query<'_>,
312 matched_qspans: &mut Vec<query::PositionSpan>,
313 aho_extra_matchables: &mut BitSet,
314 spdx_matches: &[LicenseMatch],
315) {
316 for m in spdx_matches {
317 let Some(span) = query_span_for_match(m) else {
318 continue;
319 };
320
321 for pos in span.iter() {
322 aho_extra_matchables.insert(pos);
323 }
324 query.subtract(&span);
325
326 if (m.match_coverage * 100.0).round() / 100.0 == 100.0 {
327 matched_qspans.push(span);
328 }
329 }
330}
331
332fn merge_and_prepare_aho_matches(
333 index: &index::LicenseIndex,
334 query: &mut Query<'_>,
335 matched_qspans: &mut Vec<query::PositionSpan>,
336 refined_aho: &[LicenseMatch],
337) -> (Vec<LicenseMatch>, bool) {
338 let merged_aho = merge_overlapping_matches(refined_aho);
339 let mut saw_long_exact_license_text_match = false;
340
341 for m in &merged_aho {
342 let Some(span) = query_span_for_match(m) else {
343 continue;
344 };
345
346 if has_full_match_coverage(m) {
347 matched_qspans.push(span.clone());
348 }
349
350 if index
351 .rules_by_rid
352 .get(m.rid)
353 .is_some_and(|rule| rule.is_license_text())
354 && m.rule_length > 120
355 && m.match_coverage > 98.0
356 {
357 query.subtract(&span);
358 saw_long_exact_license_text_match = true;
359 }
360 }
361
362 (merged_aho, saw_long_exact_license_text_match)
363}
364
365fn collect_whole_query_exact_followup_matches(
366 index: &index::LicenseIndex,
367 query: &mut Query<'_>,
368 matched_qspans: &mut Vec<query::PositionSpan>,
369 whole_run: &query::QueryRun<'_>,
370) -> Vec<LicenseMatch> {
371 let mut seq_all_matches = Vec::new();
372
373 if whole_run.is_matchable(false, matched_qspans) {
374 let near_dupe_candidates =
375 compute_candidates_with_msets(index, whole_run, true, MAX_NEAR_DUPE_CANDIDATES);
376
377 if !near_dupe_candidates.is_empty() {
378 let near_dupe_matches =
379 seq_match_with_candidates(index, whole_run, &near_dupe_candidates);
380
381 for m in &near_dupe_matches {
382 if m.end_token > m.start_token {
383 let span = query::PositionSpan::new(m.start_token, m.end_token - 1);
384 query.subtract(&span);
385 matched_qspans.push(span);
386 }
387 }
388
389 seq_all_matches.extend(near_dupe_matches);
390 }
391 }
392
393 seq_all_matches
394}
395
396fn collect_regular_seq_matches(
397 index: &index::LicenseIndex,
398 query: &Query<'_>,
399 matched_qspans: &[query::PositionSpan],
400 candidate_contained_matches: &[LicenseMatch],
401) -> Vec<LicenseMatch> {
402 let mut seq_all_matches = Vec::new();
403
404 for query_run in query.query_runs() {
405 if !query_run.is_matchable(false, matched_qspans) {
406 continue;
407 }
408
409 let candidates =
410 compute_candidates_with_msets(index, &query_run, false, MAX_REGULAR_SEQ_CANDIDATES);
411 if !candidates.is_empty() {
412 let matches = seq_match_with_candidates(index, &query_run, &candidates);
413 seq_all_matches.extend(matches);
414 }
415 }
416
417 let merged_seq = merge_overlapping_matches(&seq_all_matches);
418 let filtered_same_expression =
419 filter_redundant_same_expression_seq_containers(merged_seq, candidate_contained_matches);
420 filter_redundant_low_coverage_composite_seq_wrappers(
421 filtered_same_expression,
422 candidate_contained_matches,
423 )
424}
425
426impl LicenseDetectionEngine {
427 fn from_index(index: index::LicenseIndex) -> Result<Self> {
432 let mut license_vec: Vec<_> = index.licenses_by_key.values().cloned().collect();
433 license_vec.sort_by(|a, b| a.key.cmp(&b.key));
434 let spdx_mapping = build_spdx_mapping(&license_vec);
435
436 Ok(Self {
437 index: Arc::new(index),
438 spdx_mapping,
439 })
440 }
441
442 pub fn from_embedded() -> Result<Self> {
451 let artifact_bytes =
452 include_bytes!("../../resources/license_detection/license_index_loader.msgpack.zst");
453 let decompressed = zstd::decode_all(&artifact_bytes[..])
454 .map_err(|e| anyhow::anyhow!("Failed to decompress embedded artifact: {}", e))?;
455 let snapshot: embedded::schema::EmbeddedLoaderSnapshot =
456 rmp_serde::from_slice(&decompressed)
457 .map_err(|e| anyhow::anyhow!("Failed to deserialize embedded artifact: {}", e))?;
458
459 if snapshot.schema_version != embedded::schema::SCHEMA_VERSION {
460 anyhow::bail!(
461 "Embedded artifact schema version mismatch: expected {}, got {}",
462 embedded::schema::SCHEMA_VERSION,
463 snapshot.schema_version
464 );
465 }
466
467 let index = build_index_from_loaded(snapshot.rules, snapshot.licenses, false);
468 Self::from_index(index)
469 }
470
471 pub fn from_directory(rules_path: &Path) -> Result<Self> {
479 let (rules_dir, licenses_dir) = if rules_path.ends_with("data") {
480 (rules_path.join("rules"), rules_path.join("licenses"))
481 } else if rules_path.ends_with("rules") {
482 let parent = rules_path.parent().ok_or_else(|| {
483 anyhow::anyhow!("Cannot determine parent directory for rules path")
484 })?;
485 (rules_path.to_path_buf(), parent.join("licenses"))
486 } else {
487 (rules_path.to_path_buf(), rules_path.to_path_buf())
488 };
489
490 let loaded_rules = load_loaded_rules_from_directory(&rules_dir)?;
491 let loaded_licenses = load_loaded_licenses_from_directory(&licenses_dir)?;
492 let index = build_index_from_loaded(loaded_rules, loaded_licenses, false);
493
494 Self::from_index(index)
495 }
496
497 pub fn detect_with_kind(
498 &self,
499 text: &str,
500 unknown_licenses: bool,
501 binary_derived: bool,
502 ) -> Result<Vec<LicenseDetection>> {
503 let clean_text = strip_utf8_bom_str(text);
504
505 let content = if clean_text.len() > MAX_DETECTION_SIZE {
506 log::warn!(
507 "Content size {} exceeds limit {}, truncating for detection",
508 clean_text.len(),
509 MAX_DETECTION_SIZE
510 );
511 &clean_text[..MAX_DETECTION_SIZE]
512 } else {
513 clean_text
514 };
515
516 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
517 let whole_query_run = query.whole_query_run();
518
519 let mut all_matches = Vec::new();
520 let mut candidate_contained_matches = Vec::new();
521 let mut aho_extra_matchables = BitSet::new();
522 let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
523
524 {
527 let hash_matches = hash_match(&self.index, &whole_query_run);
528
529 if !hash_matches.is_empty() {
530 let mut matches = hash_matches;
531 sort_matches_by_line(&mut matches);
532
533 let groups = group_matches_by_region(&matches);
534 let detections: Vec<LicenseDetection> = groups
535 .iter()
536 .map(|group| {
537 let mut detection = create_detection_from_group(group);
538 populate_detection_from_group_with_spdx(
539 &mut detection,
540 group,
541 &self.spdx_mapping,
542 );
543 detection
544 })
545 .collect();
546
547 return Ok(post_process_detections(detections, 0.0));
548 }
549 }
550
551 {
553 let spdx_matches = spdx_lid_match(&self.index, &query);
554 let merged_spdx = merge_overlapping_matches(&spdx_matches);
555 subtract_spdx_match_qspans(
556 &mut query,
557 &mut matched_qspans,
558 &mut aho_extra_matchables,
559 &merged_spdx,
560 );
561 all_matches.extend(merged_spdx);
562 }
563
564 {
566 let aho_matches = if aho_extra_matchables.is_empty() {
567 aho_match(&self.index, &whole_query_run)
568 } else {
569 aho_match::aho_match_with_extra_matchables(
570 &self.index,
571 &whole_query_run,
572 Some(&aho_extra_matchables),
573 )
574 };
575
576 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
579 candidate_contained_matches.extend(refined_aho.clone());
580 let (merged_aho, _) = merge_and_prepare_aho_matches(
581 &self.index,
582 &mut query,
583 &mut matched_qspans,
584 &refined_aho,
585 );
586 all_matches.extend(merged_aho);
587
588 let whole_query_followup = collect_whole_query_exact_followup_matches(
589 &self.index,
590 &mut query,
591 &mut matched_qspans,
592 &whole_query_run,
593 );
594 all_matches.extend(whole_query_followup);
595
596 let merged_seq = collect_regular_seq_matches(
597 &self.index,
598 &query,
599 &matched_qspans,
600 &candidate_contained_matches,
601 );
602 all_matches.extend(merged_seq);
603 }
604
605 let merged_matches =
608 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
609
610 let refined_matches = if unknown_licenses {
613 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
615
616 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
618 let filtered_unknown =
619 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
620
621 let mut all_matches = good_matches;
622 all_matches.extend(filtered_unknown);
623 all_matches.extend(weak_matches);
626 all_matches
627 } else {
628 merged_matches
629 };
630
631 let refined = refine_matches(&self.index, refined_matches, &query);
633
634 let mut sorted = refined;
635 sort_matches_by_line(&mut sorted);
636
637 let groups = group_matches_by_region(&sorted);
638
639 let detections: Vec<LicenseDetection> = groups
640 .iter()
641 .map(|group| {
642 let mut detection = create_detection_from_group(group);
643 populate_detection_from_group_with_spdx(&mut detection, group, &self.spdx_mapping);
644 detection
645 })
646 .collect();
647
648 let detections = post_process_detections(detections, 0.0);
649
650 Ok(detections)
651 }
652
653 #[cfg(test)]
657 pub fn detect_matches_with_kind(
658 &self,
659 text: &str,
660 unknown_licenses: bool,
661 binary_derived: bool,
662 ) -> Result<Vec<LicenseMatch>> {
663 let clean_text = strip_utf8_bom_str(text);
664
665 let content = if clean_text.len() > MAX_DETECTION_SIZE {
666 log::warn!(
667 "Content size {} exceeds limit {}, truncating for detection",
668 clean_text.len(),
669 MAX_DETECTION_SIZE
670 );
671 &clean_text[..MAX_DETECTION_SIZE]
672 } else {
673 clean_text
674 };
675
676 let mut query = Query::from_extracted_text(content, &self.index, binary_derived)?;
677 let whole_query_run = query.whole_query_run();
678
679 let mut all_matches = Vec::new();
680 let mut candidate_contained_matches = Vec::new();
681 let mut aho_extra_matchables = BitSet::new();
682 let mut matched_qspans: Vec<query::PositionSpan> = Vec::new();
683
684 {
686 let hash_matches = hash_match(&self.index, &whole_query_run);
687
688 if !hash_matches.is_empty() {
689 let mut matches = hash_matches;
690 sort_matches_by_line(&mut matches);
691 return Ok(matches);
692 }
693 }
694
695 {
697 let spdx_matches = spdx_lid_match(&self.index, &query);
698 let merged_spdx = merge_overlapping_matches(&spdx_matches);
699 subtract_spdx_match_qspans(
700 &mut query,
701 &mut matched_qspans,
702 &mut aho_extra_matchables,
703 &merged_spdx,
704 );
705 all_matches.extend(merged_spdx);
706 }
707
708 {
710 let aho_matches = if aho_extra_matchables.is_empty() {
711 aho_match(&self.index, &whole_query_run)
712 } else {
713 aho_match::aho_match_with_extra_matchables(
714 &self.index,
715 &whole_query_run,
716 Some(&aho_extra_matchables),
717 )
718 };
719 let refined_aho = match_refine::refine_aho_matches(&self.index, aho_matches, &query);
720 candidate_contained_matches.extend(refined_aho.clone());
721 let (merged_aho, _) = merge_and_prepare_aho_matches(
722 &self.index,
723 &mut query,
724 &mut matched_qspans,
725 &refined_aho,
726 );
727 all_matches.extend(merged_aho);
728
729 let whole_query_followup = collect_whole_query_exact_followup_matches(
730 &self.index,
731 &mut query,
732 &mut matched_qspans,
733 &whole_query_run,
734 );
735 all_matches.extend(whole_query_followup);
736
737 let merged_seq = collect_regular_seq_matches(
738 &self.index,
739 &query,
740 &matched_qspans,
741 &candidate_contained_matches,
742 );
743 all_matches.extend(merged_seq);
744 }
745
746 let merged_matches =
748 refine_matches_without_false_positive_filter(&self.index, all_matches, &query);
749
750 let refined_matches = if unknown_licenses {
752 let (good_matches, weak_matches) = split_weak_matches(&self.index, &merged_matches);
753 let unknown_matches = unknown_match(&self.index, &query, &good_matches);
754 let filtered_unknown =
755 filter_invalid_contained_unknown_matches(&unknown_matches, &good_matches);
756
757 let mut all_matches = good_matches;
758 all_matches.extend(filtered_unknown);
759 all_matches.extend(weak_matches);
760 all_matches
761 } else {
762 merged_matches
763 };
764
765 let refined = refine_matches(&self.index, refined_matches, &query);
767
768 let mut sorted = refined;
769 sort_matches_by_line(&mut sorted);
770
771 Ok(sorted)
773 }
774
775 pub fn index(&self) -> &index::LicenseIndex {
777 &self.index
778 }
779
780 #[cfg(test)]
782 pub fn spdx_mapping(&self) -> &SpdxMapping {
783 &self.spdx_mapping
784 }
785}
786
787#[cfg(test)]
788mod tests;