1use crate::license_detection::automaton::Automaton;
4use once_cell::sync::Lazy;
5use regex::Regex;
6use sha1::{Digest, Sha1};
7
8use crate::license_detection::index::LicenseIndex;
9use crate::license_detection::index::dictionary::{TokenId, TokenKind};
10use crate::license_detection::models::position_span::PositionSpan;
11use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
12use crate::license_detection::position_set::PositionSet;
13use crate::license_detection::query::Query;
14use crate::license_detection::tokenize::STOPWORDS;
15
/// Matcher kind stamped on every match produced by this module.
pub const MATCH_UNKNOWN: MatcherKind = MatcherKind::Unknown;

/// Number of tokens in one ngram. Used to derive an ngram's start position
/// from the end position reported by the automaton and, multiplied by four,
/// as the minimum matched-span length in `unknown_match`.
const UNKNOWN_NGRAM_LENGTH: usize = 6;

/// Minimum number of ngram hits a region needs before it is considered further.
const MIN_NGRAM_MATCHES: usize = 3;

/// Unmatched regions shorter than this many tokens are skipped.
const MIN_REGION_LENGTH: usize = 5;

/// Word pattern: one or more word characters (underscore excluded), optionally
/// followed by a single `+` and further word characters.
static QUERY_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
/// Splits text into `token` captures (same shape as `QUERY_PATTERN`) and
/// `punct` captures (runs of underscores, non-word characters, whitespace and
/// `+`, optionally followed by one more such character other than `+`).
static MATCHED_TEXT_PATTERN: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("Invalid matched text regex pattern")
});
30
/// One piece of the original query text (a word or a punctuation run) as
/// produced by `tokenize_matched_unknown_text`.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the token as it will be rendered.
    value: String,
    /// 1-based number of the text line the token was found on.
    line_num: usize,
    /// Running position assigned to tokens found in the index dictionary;
    /// `None` for punctuation and for words the dictionary does not know.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// `true` when the dictionary lookup for the word succeeded.
    is_known: bool,
    /// Set by `collect_reportable_unknown_tokens` when `pos` is one of the
    /// matched positions; always `false` straight after tokenization.
    is_matched: bool,
}
40
41pub fn unknown_match(
42 index: &LicenseIndex,
43 query: &Query,
44 known_matches: &[LicenseMatch],
45) -> Vec<LicenseMatch> {
46 let mut unknown_matches = Vec::new();
47
48 if query.tokens.is_empty() {
49 return unknown_matches;
50 }
51
52 let query_len = query.tokens.len();
53
54 let covered_positions = compute_covered_positions(query, known_matches);
55
56 let unmatched_regions = find_unmatched_regions(query_len, &covered_positions);
57
58 let automaton = &index.unknown_automaton;
59
60 for region in unmatched_regions {
61 let start = region.0;
62 let end = region.1;
63
64 let region_length = end - start;
65 if region_length < MIN_REGION_LENGTH {
66 continue;
67 }
68
69 let matched_ngrams = get_matched_ngrams(&query.tokens, start, end, automaton);
70
71 if matched_ngrams.len() < MIN_NGRAM_MATCHES {
72 continue;
73 }
74
75 let qspan = compute_qspan_union(&matched_ngrams);
76
77 if qspan.is_empty() {
78 continue;
79 }
80
81 let qspan_length = qspan.len();
82
83 #[cfg(debug_assertions)]
85 {
86 eprintln!("\n=== UNKNOWN MATCH DEBUG ===");
87 eprintln!("Region: {}-{} ({} tokens)", start, end, region_length);
88 eprintln!("matched_ngrams: {} matches", matched_ngrams.len());
89 eprintln!("qspan: {:?}", qspan);
90 eprintln!(
91 "qspan_length: {} (threshold: {})",
92 qspan_length,
93 UNKNOWN_NGRAM_LENGTH * 4
94 );
95 }
96
97 if qspan_length < UNKNOWN_NGRAM_LENGTH * 4 {
98 continue;
99 }
100
101 let hispan = compute_hispan_from_qspan(&query.tokens, &qspan, index);
102
103 #[cfg(debug_assertions)]
104 {
105 eprintln!("hispan: {} (threshold: 5)", hispan);
106 }
107
108 if hispan < 5 {
109 continue;
110 }
111
112 if let Some(match_result) = create_unknown_match_from_qspan(query, &qspan) {
113 unknown_matches.push(match_result);
114 }
115 }
116
117 unknown_matches
118}
119
120fn compute_covered_positions(_query: &Query, known_matches: &[LicenseMatch]) -> PositionSet {
121 let mut covered = PositionSet::new();
122 for m in known_matches {
123 covered.extend_from_span(m.query_span());
124 }
125 covered
126}
127
128fn find_unmatched_regions(
129 query_len: usize,
130 covered_positions: &PositionSet,
131) -> Vec<(usize, usize)> {
132 let mut regions = Vec::new();
133
134 if query_len == 0 {
135 return regions;
136 }
137
138 let mut region_start = None;
139
140 for pos in 0..query_len {
141 if !covered_positions.contains(pos) {
142 if region_start.is_none() {
143 region_start = Some(pos);
144 }
145 } else if let Some(start) = region_start {
146 regions.push((start, pos));
147 region_start = None;
148 }
149 }
150
151 if let Some(start) = region_start {
152 regions.push((start, query_len));
153 }
154
155 regions
156}
157
158fn get_matched_ngrams(
159 tokens: &[TokenId],
160 start: usize,
161 end: usize,
162 automaton: &Automaton,
163) -> Vec<(usize, usize)> {
164 if start >= end || end > tokens.len() {
165 return Vec::new();
166 }
167
168 let region_tokens = &tokens[start..end];
169
170 let region_bytes: Vec<u8> = region_tokens
171 .iter()
172 .flat_map(|tid| tid.to_le_bytes())
173 .collect();
174
175 let offset = UNKNOWN_NGRAM_LENGTH;
176 let mut matches = Vec::new();
177
178 for m in automaton.find_overlapping_iter(®ion_bytes) {
179 let local_qend = m.end / 2;
180 let qend = start + local_qend;
181 let qstart = qend.saturating_sub(offset);
182 matches.push((qstart, qend));
183 }
184
185 matches
186}
187
188fn compute_qspan_union(positions: &[(usize, usize)]) -> PositionSet {
189 if positions.is_empty() {
190 return PositionSet::new();
191 }
192
193 let mut sorted: Vec<_> = positions.to_vec();
194 sorted.sort_by_key(|p| p.0);
195
196 let mut merged: Vec<(usize, usize)> = Vec::new();
197 let mut current = sorted[0];
198
199 for (start, end) in sorted.into_iter().skip(1) {
200 if start <= current.1 {
201 current.1 = current.1.max(end);
202 } else {
203 merged.push(current);
204 current = (start, end);
205 }
206 }
207 merged.push(current);
208
209 let mut result = PositionSet::new();
210 for (start, end) in merged {
211 result.extend_from_span(&PositionSpan::range(start, end));
212 }
213 result
214}
215
216fn compute_hispan_from_qspan(
217 tokens: &[TokenId],
218 qspan: &PositionSet,
219 index: &LicenseIndex,
220) -> usize {
221 qspan
222 .iter()
223 .filter(|&pos| {
224 tokens
225 .get(pos)
226 .is_some_and(|&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
227 })
228 .count()
229}
230
231fn create_unknown_match_from_qspan(query: &Query, qspan: &PositionSet) -> Option<LicenseMatch> {
232 if qspan.is_empty() {
233 return None;
234 }
235
236 let match_len = qspan.len();
237
238 let start = qspan.min_pos();
239 let end = qspan.max_pos() + 1;
240
241 let start_line = query.line_by_pos.get(start).copied().unwrap_or(1);
242 let end_line = query
243 .line_by_pos
244 .get(end.saturating_sub(1))
245 .copied()
246 .unwrap_or(start_line);
247
248 let qspan_positions: Vec<usize> = qspan.iter().collect();
249 let synthetic_rule_text =
250 build_unknown_rule_text(query, &qspan_positions, start_line, end_line);
251 let rule_identifier = build_unknown_rule_identifier(&synthetic_rule_text);
252
253 let ngram_count = qspan.len();
254
255 let score = calculate_score(ngram_count, match_len);
256
257 let qspan_span = qspan.to_position_span();
258
259 LicenseMatch {
260 rid: 0,
261 license_expression: "unknown".to_string(),
262 license_expression_spdx: None,
263 from_file: None,
264 start_line,
265 end_line,
266 start_token: start,
267 end_token: end,
268 matcher: MATCH_UNKNOWN,
269 score,
270 matched_length: match_len,
271 rule_length: match_len,
272 match_coverage: 100.0,
273 rule_relevance: 50,
274 rule_identifier,
275 rule_url: String::new(),
276 matched_text: None,
277 referenced_filenames: None,
278 rule_kind: crate::license_detection::models::RuleKind::None,
279 is_from_license: false,
280 rule_start_token: 0,
281 coordinates: MatchCoordinates::query_region(qspan_span),
282 candidate_resemblance: 0.0,
283 candidate_containment: 0.0,
284 }
285 .into()
286}
287
288fn build_unknown_rule_text(
289 query: &Query,
290 qspan_positions: &[usize],
291 start_line: usize,
292 end_line: usize,
293) -> String {
294 let Some(&start_pos) = qspan_positions.first() else {
295 return String::new();
296 };
297 let Some(&end_pos) = qspan_positions.last() else {
298 return String::new();
299 };
300
301 let matched_positions: PositionSet = qspan_positions.iter().copied().collect();
302 let tokens = tokenize_matched_unknown_text(&query.text, query);
303 let reportable_tokens = collect_reportable_unknown_tokens(
304 tokens,
305 &matched_positions,
306 start_pos,
307 end_pos,
308 start_line,
309 end_line,
310 );
311 let line_endings = collect_line_endings(&query.text);
312
313 render_unknown_rule_tokens(&reportable_tokens, &line_endings)
314}
315
316fn tokenize_matched_unknown_text(text: &str, query: &Query) -> Vec<MatchedTextToken> {
317 let mut tokens = Vec::new();
318 let mut pos = 0usize;
319 let mut line_num = 1usize;
320
321 for line in text.split_inclusive('\n') {
322 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
323 if let Some(token_match) = capture.name("token") {
324 let token_text = token_match.as_str();
325 let retokenized: Vec<String> = QUERY_PATTERN
326 .find_iter(&token_text.to_lowercase())
327 .map(|m| m.as_str().to_string())
328 .filter(|token| !STOPWORDS.contains(token.as_str()))
329 .collect();
330
331 if retokenized.is_empty() {
332 tokens.push(MatchedTextToken {
333 value: token_text.to_string(),
334 line_num,
335 pos: None,
336 is_text: true,
337 is_known: false,
338 is_matched: false,
339 });
340 } else if retokenized.len() == 1 {
341 let token = &retokenized[0];
342 let is_known = query.index.dictionary.get(token).is_some();
343 let token_pos = if is_known {
344 let current_pos = pos;
345 pos += 1;
346 Some(current_pos)
347 } else {
348 None
349 };
350
351 tokens.push(MatchedTextToken {
352 value: token_text.to_string(),
353 line_num,
354 pos: token_pos,
355 is_text: true,
356 is_known,
357 is_matched: false,
358 });
359 } else {
360 for token in retokenized {
361 let is_known = query.index.dictionary.get(&token).is_some();
362 let token_pos = if is_known {
363 let current_pos = pos;
364 pos += 1;
365 Some(current_pos)
366 } else {
367 None
368 };
369
370 tokens.push(MatchedTextToken {
371 value: token,
372 line_num,
373 pos: token_pos,
374 is_text: true,
375 is_known,
376 is_matched: false,
377 });
378 }
379 }
380 } else if let Some(punct_match) = capture.name("punct") {
381 tokens.push(MatchedTextToken {
382 value: punct_match.as_str().to_string(),
383 line_num,
384 pos: None,
385 is_text: false,
386 is_known: false,
387 is_matched: false,
388 });
389 }
390 }
391
392 line_num += 1;
393 }
394
395 tokens
396}
397
398fn collect_reportable_unknown_tokens(
399 tokens: Vec<MatchedTextToken>,
400 matched_positions: &PositionSet,
401 start_pos: usize,
402 end_pos: usize,
403 start_line: usize,
404 end_line: usize,
405) -> Vec<MatchedTextToken> {
406 let mut reportable = Vec::new();
407 let mut started = false;
408 let mut finished = false;
409 let mut end_real_pos = None;
410 let mut last_real_pos = None;
411
412 for (real_pos, mut token) in tokens.into_iter().enumerate() {
413 if token.line_num < start_line {
414 continue;
415 }
416
417 if token.line_num > end_line {
418 break;
419 }
420
421 let mut is_included = false;
422
423 if token
424 .pos
425 .is_some_and(|pos| token.is_known && matched_positions.contains(pos))
426 {
427 token.is_matched = true;
428 is_included = true;
429 }
430
431 if !started && token.pos == Some(start_pos) {
432 started = true;
433 is_included = true;
434 }
435
436 if started && !finished {
437 is_included = true;
438 }
439
440 if token.pos == Some(end_pos) {
441 finished = true;
442 started = false;
443 end_real_pos = Some(real_pos);
444 }
445
446 if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
447 end_real_pos = None;
448 if !token.is_text && !token.value.trim().is_empty() {
449 is_included = true;
450 }
451 }
452
453 last_real_pos = Some(real_pos);
454
455 if is_included {
456 reportable.push(token);
457 }
458 }
459
460 reportable
461}
462
/// Returns the terminator of every line of `text`, in order: `"\r\n"`, `"\n"`,
/// or an empty string for a final line without a newline.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let ending = match line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_owned());
    }

    endings
}
476
477fn render_unknown_rule_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
478 let mut rendered = String::new();
479 let mut previous_line: Option<usize> = None;
480
481 for token in tokens {
482 if let Some(prev_line) = previous_line
483 && token.line_num > prev_line
484 {
485 for line in prev_line..token.line_num {
486 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
487 rendered.push_str(line_ending.as_str());
488 }
489 }
490 }
491
492 let token_value = if token.is_text {
493 token.value.as_str()
494 } else {
495 token
496 .value
497 .strip_suffix("\r\n")
498 .or_else(|| token.value.strip_suffix('\n'))
499 .unwrap_or(token.value.as_str())
500 };
501
502 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
503 if token.is_matched {
504 rendered.push_str(token_value);
505 } else {
506 rendered.push('.');
507 }
508 } else {
509 rendered.push_str(token_value);
510 }
511
512 previous_line = Some(token.line_num);
513 }
514
515 rendered
516}
517
518fn build_unknown_rule_identifier(rule_text: &str) -> String {
519 let content = format!("None{}", python_str_repr(rule_text));
520 let mut hasher = Sha1::new();
521 hasher.update(content.as_bytes());
522 let digest = hasher.finalize();
523
524 format!("license-detection-unknown-{}", hex::encode(digest))
525}
526
/// Formats `text` the way Python's `repr()` formats a `str`.
///
/// Quoting: single quotes are used unless the text contains a `'` and no `"`,
/// in which case double quotes are used. A `'` is backslash-escaped only inside
/// a single-quoted result; a `"` never needs escaping because double quotes
/// are only chosen when the text has none.
///
/// Escaping: backslash, `\n`, `\r` and `\t` use their short forms. Other
/// control characters and all non-space Unicode separators are written as
/// `\xNN`, `\uNNNN` or `\UNNNNNNNN` (lowercase hex), depending on the code
/// point.
///
/// NOTE(review): Python also escapes format (Cf), private-use and unassigned
/// code points; the standard library offers no test for those, so they are
/// still emitted verbatim here.
fn python_str_repr(text: &str) -> String {
    let use_double_quotes = text.contains('\'') && !text.contains('"');
    let quote = if use_double_quotes { '"' } else { '\'' };

    let mut repr = String::with_capacity(text.len() + 2);
    repr.push(quote);

    for ch in text.chars() {
        match ch {
            '\\' => repr.push_str("\\\\"),
            '\n' => repr.push_str("\\n"),
            '\r' => repr.push_str("\\r"),
            '\t' => repr.push_str("\\t"),
            '\'' if !use_double_quotes => repr.push_str("\\'"),
            // Control characters plus line/paragraph/space separators other
            // than the plain ASCII space are not printable in Python.
            _ if ch.is_control() || (ch.is_whitespace() && ch != ' ') => {
                let code = u32::from(ch);
                let escape = if code < 0x100 {
                    format!("\\x{:02x}", code)
                } else if code < 0x1_0000 {
                    format!("\\u{:04x}", code)
                } else {
                    format!("\\U{:08x}", code)
                };
                repr.push_str(&escape);
            }
            _ => repr.push(ch),
        }
    }

    repr.push(quote);
    repr
}
546
/// Returns `ngram_count / match_len` capped at `1.0`, or `0.0` when
/// `match_len` is zero.
fn calculate_score(ngram_count: usize, match_len: usize) -> f32 {
    match match_len {
        0 => 0.0,
        len => (ngram_count as f32 / len as f32).min(1.0),
    }
}
556
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::{TokenId, tid};
    use crate::license_detection::query::Query;

    /// Converts raw ids into `TokenId`s.
    fn tids(values: &[u16]) -> Vec<TokenId> {
        values.iter().copied().map(tid).collect()
    }

    /// Builds a single-line, Aho-matched `LicenseMatch` fixture. Only the
    /// fields that differ between tests are parameters; `length` is used for
    /// both `matched_length` and `rule_length`.
    fn aho_match(
        license_expression: &str,
        spdx: &str,
        (start_token, end_token): (usize, usize),
        length: usize,
        matched_text: &str,
        query_span: PositionSpan,
    ) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: license_expression.to_string(),
            license_expression_spdx: Some(spdx.to_string()),
            from_file: None,
            start_line: 1,
            end_line: 1,
            start_token,
            end_token,
            matcher: MatcherKind::Aho,
            score: 1.0,
            matched_length: length,
            rule_length: length,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some(matched_text.to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(query_span),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// `aho_match` with the generic "test" expression used by the coverage tests.
    fn test_rule_match(
        token_range: (usize, usize),
        length: usize,
        query_span: PositionSpan,
    ) -> LicenseMatch {
        aho_match(
            "test",
            "TEST",
            token_range,
            length,
            "matched text",
            query_span,
        )
    }

    #[test]
    fn test_unknown_match_empty_query() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("", &index, false).expect("Failed to create query");
        let known_matches = vec![];

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(matches.is_empty());
    }

    #[test]
    fn test_find_unmatched_regions_no_coverage() {
        let query_len = 10;
        let covered_positions = PositionSet::new();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions, vec![(0, 10)]);
    }

    #[test]
    fn test_find_unmatched_regions_full_coverage() {
        let query_len = 10;
        let covered_positions: PositionSet = (0..10).collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert!(regions.is_empty());
    }

    #[test]
    fn test_find_unmatched_regions_partial_coverage() {
        let query_len = 20;
        let covered_positions: PositionSet = [0, 1, 2, 12, 13, 14, 15, 16, 17, 18, 19]
            .iter()
            .cloned()
            .collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (3, 12));
    }

    #[test]
    fn test_find_unmatched_regions_trailing_unmatched() {
        let query_len = 20;
        let covered_positions: PositionSet = [0, 1, 2, 3, 4, 5].iter().cloned().collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (6, 20));
    }

    #[test]
    fn test_compute_qspan_union_empty() {
        let positions: Vec<(usize, usize)> = Vec::new();
        let merged = compute_qspan_union(&positions);
        assert!(merged.is_empty());
    }

    #[test]
    fn test_compute_qspan_union_single() {
        let positions = vec![(5, 11)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 6);
        assert!(merged.contains(5));
        assert!(merged.contains(10));
        assert!(!merged.contains(4));
        assert!(!merged.contains(11));
    }

    #[test]
    fn test_compute_qspan_union_overlapping() {
        let positions = vec![(5, 11), (8, 14), (20, 26)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(!merged.contains(14));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
        assert!(!merged.contains(26));
    }

    #[test]
    fn test_compute_qspan_union_adjacent() {
        let positions = vec![(5, 11), (11, 17)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 12);
        assert!(merged.contains(5));
        assert!(merged.contains(16));
        assert!(!merged.contains(4));
        assert!(!merged.contains(17));
    }

    #[test]
    fn test_compute_qspan_union_unsorted() {
        let positions = vec![(20, 26), (5, 11), (8, 14)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
    }

    #[test]
    fn test_compute_hispan_from_qspan() {
        let mut index = LicenseIndex::with_legalese_count(0);
        let legalese_entries: Vec<(String, u16)> = (0..15)
            .map(|i| (format!("legalese-{i}"), i as u16))
            .collect();
        index.dictionary =
            crate::license_detection::index::dictionary::TokenDictionary::new_with_legalese(
                &legalese_entries
                    .iter()
                    .map(|(token, id)| (token.as_str(), *id))
                    .collect::<Vec<_>>(),
            );

        let mut tokens: Vec<TokenId> = (0..15)
            .map(|i| {
                index
                    .dictionary
                    .get_token_id(&format!("legalese-{i}"))
                    .unwrap()
            })
            .collect();
        for i in 15..30 {
            tokens.push(index.dictionary.get_or_assign(&format!("regular-{i}")));
        }
        let qspan: PositionSet = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24]
            .iter()
            .copied()
            .collect();
        let hispan = compute_hispan_from_qspan(&tokens, &qspan, &index);
        assert_eq!(hispan, 10);
    }

    #[test]
    fn test_get_matched_ngrams_empty_automaton() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3, 4, 5, 6, 7, 8]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 0, 8, &automaton);

        assert!(matches.is_empty());
    }

    #[test]
    fn test_get_matched_ngrams_with_matches() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens: Vec<TokenId> = (0..30).map(tid).collect();
        let ngram: Vec<u8> = vec![0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0];

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, 30, &automaton);

        assert!(!matches.is_empty(), "Should find ngram matches");

        for (qstart, qend) in &matches {
            assert_eq!(*qend - *qstart, UNKNOWN_NGRAM_LENGTH);
        }
    }

    #[test]
    fn test_get_matched_ngrams_keeps_overlapping_matches() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3, 1, 2, 3, 1, 2, 3]);
        let overlapping_ngram: Vec<u8> = tokens[..UNKNOWN_NGRAM_LENGTH]
            .iter()
            .flat_map(|tid| tid.to_le_bytes())
            .collect();

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&overlapping_ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, tokens.len(), &automaton);

        assert_eq!(matches, vec![(0, 6), (3, 9)]);
    }

    #[test]
    fn test_calculate_score() {
        let score1 = calculate_score(5, 10);
        let score2 = calculate_score(10, 10);
        let score3 = calculate_score(0, 10);

        assert!(score2 > score1);
        assert!(score2 <= 1.0);
        assert_eq!(score3, 0.0);
    }

    #[test]
    fn test_find_unmatched_regions_leading_unmatched() {
        let query_len = 20;
        let covered_positions: PositionSet = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
            .iter()
            .cloned()
            .collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (0, 10));
    }

    #[test]
    fn test_find_unmatched_regions_middle_gap() {
        let query_len = 30;
        let covered_positions: PositionSet =
            [0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
                .iter()
                .cloned()
                .collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (5, 20));
    }

    #[test]
    fn test_compute_covered_positions_gapped_qspan() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![test_rule_match(
            (0, 10),
            6,
            PositionSpan::from_positions(vec![0, 1, 2, 7, 8, 9]),
        )];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(0), "Should contain position 0");
        assert!(covered.contains(2), "Should contain position 2");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(!covered.contains(3), "Should NOT contain position 3 (gap)");
        assert!(!covered.contains(5), "Should NOT contain position 5 (gap)");
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (outside)"
        );
    }

    #[test]
    fn test_compute_covered_positions_fallback_contiguous() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![test_rule_match((5, 10), 5, PositionSpan::range(5, 10))];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(5), "Should contain position 5");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(
            !covered.contains(4),
            "Should NOT contain position 4 (before)"
        );
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (after)"
        );
    }

    #[test]
    fn test_compute_covered_positions_qspan_creates_extra_unmatched_region() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![test_rule_match(
            (0, 15),
            8,
            PositionSpan::from_positions(vec![0, 1, 2, 3, 11, 12, 13, 14]),
        )];

        let covered = compute_covered_positions(&query, &known_matches);
        let regions = find_unmatched_regions(20, &covered);

        assert!(
            regions.contains(&(4, 11)),
            "Should have unmatched region 4-11 (the gap in qspan_positions), got: {:?}",
            regions
        );
        assert!(
            regions.contains(&(15, 20)),
            "Should have trailing unmatched region 15-20, got: {:?}",
            regions
        );

        let contiguous_covered: PositionSet = (0..15).collect();
        let contiguous_regions = find_unmatched_regions(20, &contiguous_covered);
        assert_eq!(
            contiguous_regions,
            vec![(15, 20)],
            "Contiguous coverage would collapse the gap, producing only trailing region"
        );
    }

    #[test]
    fn test_create_unknown_match_from_qspan_valid() {
        use crate::license_detection::test_utils::create_mock_query_with_tokens;

        let index = LicenseIndex::with_legalese_count(10);

        let tokens: Vec<u16> = (0..30).collect();
        let query = create_mock_query_with_tokens(&tokens, &index);

        let qspan: PositionSet = (0..30).collect();

        let match_result = create_unknown_match_from_qspan(&query, &qspan);

        assert!(
            match_result.is_some(),
            "Should create unknown match for sufficient length"
        );

        let m = match_result.unwrap();
        assert_eq!(m.license_expression, "unknown");
        assert_eq!(m.matcher, MATCH_UNKNOWN);
        assert!(!m.coordinates.query_span().is_empty());
    }

    #[test]
    fn test_unknown_match_with_known_matches() {
        let index = LicenseIndex::with_legalese_count(10);
        let text = "some text that is license related and should be detected";
        let query =
            Query::from_extracted_text(text, &index, false).expect("Failed to create query");

        let known_matches = vec![aho_match(
            "mit",
            "MIT",
            (0, 5),
            5,
            "some text",
            PositionSpan::range(0, 5),
        )];

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(
            matches.is_empty() || matches[0].start_line > 1,
            "Should not re-detect known regions"
        );
    }

    #[test]
    fn test_calculate_score_edge_cases() {
        let score_zero_length = calculate_score(10, 0);
        assert_eq!(score_zero_length, 0.0, "Zero length should have zero score");

        let score_zero_ngrams = calculate_score(0, 100);
        assert_eq!(score_zero_ngrams, 0.0, "Zero ngrams should have zero score");

        let score_high_density = calculate_score(100, 50);
        assert_eq!(
            score_high_density, 1.0,
            "High density should be capped at 1.0"
        );
    }

    #[test]
    fn test_get_matched_ngrams_out_of_bounds() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 5, 10, &automaton);
        assert!(matches.is_empty(), "Out of bounds should return empty");

        let matches = get_matched_ngrams(&tokens, 2, 1, &automaton);
        assert!(matches.is_empty(), "Invalid range should return empty");
    }
}