1use crate::license_detection::automaton::Automaton;
7use regex::Regex;
8use sha1::{Digest, Sha1};
9use std::sync::LazyLock;
10
11use crate::license_detection::index::LicenseIndex;
12use crate::license_detection::index::dictionary::{TokenId, TokenKind};
13use crate::license_detection::models::position_span::PositionSpan;
14use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
15use crate::license_detection::position_set::PositionSet;
16use crate::license_detection::query::Query;
17use crate::license_detection::tokenize::STOPWORDS;
18use crate::models::LineNumber;
19use crate::models::MatchScore;
20
/// Matcher kind stamped on every match produced by this module.
pub const MATCH_UNKNOWN: MatcherKind = MatcherKind::Unknown;

/// Number of tokens covered by a single ngram hit; used to derive the start of
/// a hit from its end, and (times four) as the minimum merged-span length.
const UNKNOWN_NGRAM_LENGTH: usize = 6;

/// Minimum number of ngram hits an unmatched region needs to be considered.
const MIN_NGRAM_MATCHES: usize = 3;

/// Minimum length, in tokens, of an unmatched region worth scanning.
const MIN_REGION_LENGTH: usize = 5;

/// Word pattern used to re-tokenize the lowercased text of a `token` capture.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
/// Splits a line of text into alternating `token` (word) and `punct`
/// (punctuation / whitespace) captures.
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("Invalid matched text regex pattern")
});
35
/// One piece of the original text (a word or a run of punctuation) together
/// with the bookkeeping needed to render the synthetic unknown-rule text.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the token as stored by the tokenizer.
    value: String,
    /// 1-based number of the line the token was found on.
    line_num: usize,
    /// Position assigned to the token when it is known to the dictionary;
    /// `None` for unknown words and punctuation.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace.
    is_text: bool,
    /// `true` when the word was found in the index dictionary.
    is_known: bool,
    /// Set once the token's position is found in the matched position set.
    is_matched: bool,
}
45
46pub fn unknown_match(
47 index: &LicenseIndex,
48 query: &Query,
49 known_matches: &[LicenseMatch],
50) -> Vec<LicenseMatch> {
51 let mut unknown_matches = Vec::new();
52
53 if query.tokens.is_empty() {
54 return unknown_matches;
55 }
56
57 let query_len = query.tokens.len();
58
59 let covered_positions = compute_covered_positions(query, known_matches);
60
61 let unmatched_regions = find_unmatched_regions(query_len, &covered_positions);
62
63 let automaton = &index.unknown_automaton;
64
65 for region in unmatched_regions {
66 let start = region.0;
67 let end = region.1;
68
69 let region_length = end - start;
70 if region_length < MIN_REGION_LENGTH {
71 continue;
72 }
73
74 let matched_ngrams = get_matched_ngrams(&query.tokens, start, end, automaton);
75
76 if matched_ngrams.len() < MIN_NGRAM_MATCHES {
77 continue;
78 }
79
80 let qspan = compute_qspan_union(&matched_ngrams);
81
82 if qspan.is_empty() {
83 continue;
84 }
85
86 let qspan_length = qspan.len();
87
88 #[cfg(debug_assertions)]
90 {
91 eprintln!("\n=== UNKNOWN MATCH DEBUG ===");
92 eprintln!("Region: {}-{} ({} tokens)", start, end, region_length);
93 eprintln!("matched_ngrams: {} matches", matched_ngrams.len());
94 eprintln!("qspan: {:?}", qspan);
95 eprintln!(
96 "qspan_length: {} (threshold: {})",
97 qspan_length,
98 UNKNOWN_NGRAM_LENGTH * 4
99 );
100 }
101
102 if qspan_length < UNKNOWN_NGRAM_LENGTH * 4 {
103 continue;
104 }
105
106 let hispan = compute_hispan_from_qspan(&query.tokens, &qspan, index);
107
108 #[cfg(debug_assertions)]
109 {
110 eprintln!("hispan: {} (threshold: 5)", hispan);
111 }
112
113 if hispan < 5 {
114 continue;
115 }
116
117 if let Some(match_result) = create_unknown_match_from_qspan(query, &qspan) {
118 unknown_matches.push(match_result);
119 }
120 }
121
122 unknown_matches
123}
124
125fn compute_covered_positions(_query: &Query, known_matches: &[LicenseMatch]) -> PositionSet {
126 let mut covered = PositionSet::new();
127 for m in known_matches {
128 covered.extend_from_span(m.query_span());
129 }
130 covered
131}
132
133fn find_unmatched_regions(
134 query_len: usize,
135 covered_positions: &PositionSet,
136) -> Vec<(usize, usize)> {
137 let mut regions = Vec::new();
138
139 if query_len == 0 {
140 return regions;
141 }
142
143 let mut region_start = None;
144
145 for pos in 0..query_len {
146 if !covered_positions.contains(pos) {
147 if region_start.is_none() {
148 region_start = Some(pos);
149 }
150 } else if let Some(start) = region_start {
151 regions.push((start, pos));
152 region_start = None;
153 }
154 }
155
156 if let Some(start) = region_start {
157 regions.push((start, query_len));
158 }
159
160 regions
161}
162
163fn get_matched_ngrams(
164 tokens: &[TokenId],
165 start: usize,
166 end: usize,
167 automaton: &Automaton,
168) -> Vec<(usize, usize)> {
169 if start >= end || end > tokens.len() {
170 return Vec::new();
171 }
172
173 let region_tokens = &tokens[start..end];
174
175 let region_bytes: Vec<u8> = region_tokens
176 .iter()
177 .flat_map(|tid| tid.to_le_bytes())
178 .collect();
179
180 let offset = UNKNOWN_NGRAM_LENGTH;
181 let mut matches = Vec::new();
182
183 for m in automaton.find_overlapping_iter(®ion_bytes) {
184 let local_qend = m.end / 2;
185 let qend = start + local_qend;
186 let qstart = qend.saturating_sub(offset);
187 matches.push((qstart, qend));
188 }
189
190 matches
191}
192
193fn compute_qspan_union(positions: &[(usize, usize)]) -> PositionSet {
194 if positions.is_empty() {
195 return PositionSet::new();
196 }
197
198 let mut sorted: Vec<_> = positions.to_vec();
199 sorted.sort_by_key(|p| p.0);
200
201 let mut merged: Vec<(usize, usize)> = Vec::new();
202 let mut current = sorted[0];
203
204 for (start, end) in sorted.into_iter().skip(1) {
205 if start <= current.1 {
206 current.1 = current.1.max(end);
207 } else {
208 merged.push(current);
209 current = (start, end);
210 }
211 }
212 merged.push(current);
213
214 let mut result = PositionSet::new();
215 for (start, end) in merged {
216 result.extend_from_span(&PositionSpan::range(start, end));
217 }
218 result
219}
220
221fn compute_hispan_from_qspan(
222 tokens: &[TokenId],
223 qspan: &PositionSet,
224 index: &LicenseIndex,
225) -> usize {
226 qspan
227 .iter()
228 .filter(|&pos| {
229 tokens
230 .get(pos)
231 .is_some_and(|&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
232 })
233 .count()
234}
235
236fn create_unknown_match_from_qspan(query: &Query, qspan: &PositionSet) -> Option<LicenseMatch> {
237 if qspan.is_empty() {
238 return None;
239 }
240
241 let match_len = qspan.len();
242
243 let start = qspan.min_pos();
244 let end = qspan.max_pos() + 1;
245
246 let start_line = query
247 .line_by_pos
248 .get(start)
249 .copied()
250 .and_then(LineNumber::new)
251 .unwrap_or(LineNumber::ONE);
252 let end_line = query
253 .line_by_pos
254 .get(end.saturating_sub(1))
255 .copied()
256 .and_then(LineNumber::new)
257 .unwrap_or(start_line);
258
259 let qspan_positions: Vec<usize> = qspan.iter().collect();
260 let synthetic_rule_text =
261 build_unknown_rule_text(query, &qspan_positions, start_line, end_line);
262 let rule_identifier = build_unknown_rule_identifier(&synthetic_rule_text);
263
264 let ngram_count = qspan.len();
265
266 let score = calculate_score(ngram_count, match_len);
267
268 let qspan_span = qspan.to_position_span();
269
270 LicenseMatch {
271 rid: 0,
272 license_expression: "unknown".to_string(),
273 license_expression_spdx: None,
274 from_file: None,
275 start_line,
276 end_line,
277 start_token: start,
278 end_token: end,
279 matcher: MATCH_UNKNOWN,
280 score,
281 matched_length: match_len,
282 rule_length: match_len,
283 match_coverage: 100.0,
284 rule_relevance: 50,
285 rule_identifier,
286 rule_url: String::new(),
287 matched_text: None,
288 referenced_filenames: None,
289 rule_kind: crate::license_detection::models::RuleKind::None,
290 is_from_license: false,
291 rule_start_token: 0,
292 coordinates: MatchCoordinates::query_region(qspan_span),
293 candidate_resemblance: 0.0,
294 candidate_containment: 0.0,
295 }
296 .into()
297}
298
299fn build_unknown_rule_text(
300 query: &Query,
301 qspan_positions: &[usize],
302 start_line: LineNumber,
303 end_line: LineNumber,
304) -> String {
305 let Some(&start_pos) = qspan_positions.first() else {
306 return String::new();
307 };
308 let Some(&end_pos) = qspan_positions.last() else {
309 return String::new();
310 };
311
312 let matched_positions: PositionSet = qspan_positions.iter().copied().collect();
313 let tokens = tokenize_matched_unknown_text(&query.text, query);
314 let reportable_tokens = collect_reportable_unknown_tokens(
315 tokens,
316 &matched_positions,
317 start_pos,
318 end_pos,
319 start_line.get(),
320 end_line.get(),
321 );
322 let line_endings = collect_line_endings(&query.text);
323
324 render_unknown_rule_tokens(&reportable_tokens, &line_endings)
325}
326
327fn tokenize_matched_unknown_text(text: &str, query: &Query) -> Vec<MatchedTextToken> {
328 let mut tokens = Vec::new();
329 let mut pos = 0usize;
330 let mut line_num = 1usize;
331
332 for line in text.split_inclusive('\n') {
333 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
334 if let Some(token_match) = capture.name("token") {
335 let token_text = token_match.as_str();
336 let retokenized: Vec<String> = QUERY_PATTERN
337 .find_iter(&token_text.to_lowercase())
338 .map(|m| m.as_str().to_string())
339 .filter(|token| !STOPWORDS.contains(token.as_str()))
340 .collect();
341
342 if retokenized.is_empty() {
343 tokens.push(MatchedTextToken {
344 value: token_text.to_string(),
345 line_num,
346 pos: None,
347 is_text: true,
348 is_known: false,
349 is_matched: false,
350 });
351 } else if retokenized.len() == 1 {
352 let token = &retokenized[0];
353 let is_known = query.index.dictionary.get(token).is_some();
354 let token_pos = if is_known {
355 let current_pos = pos;
356 pos += 1;
357 Some(current_pos)
358 } else {
359 None
360 };
361
362 tokens.push(MatchedTextToken {
363 value: token_text.to_string(),
364 line_num,
365 pos: token_pos,
366 is_text: true,
367 is_known,
368 is_matched: false,
369 });
370 } else {
371 for token in retokenized {
372 let is_known = query.index.dictionary.get(&token).is_some();
373 let token_pos = if is_known {
374 let current_pos = pos;
375 pos += 1;
376 Some(current_pos)
377 } else {
378 None
379 };
380
381 tokens.push(MatchedTextToken {
382 value: token,
383 line_num,
384 pos: token_pos,
385 is_text: true,
386 is_known,
387 is_matched: false,
388 });
389 }
390 }
391 } else if let Some(punct_match) = capture.name("punct") {
392 tokens.push(MatchedTextToken {
393 value: punct_match.as_str().to_string(),
394 line_num,
395 pos: None,
396 is_text: false,
397 is_known: false,
398 is_matched: false,
399 });
400 }
401 }
402
403 line_num += 1;
404 }
405
406 tokens
407}
408
409fn collect_reportable_unknown_tokens(
410 tokens: Vec<MatchedTextToken>,
411 matched_positions: &PositionSet,
412 start_pos: usize,
413 end_pos: usize,
414 start_line: usize,
415 end_line: usize,
416) -> Vec<MatchedTextToken> {
417 let mut reportable = Vec::new();
418 let mut started = false;
419 let mut finished = false;
420 let mut end_real_pos = None;
421 let mut last_real_pos = None;
422
423 for (real_pos, mut token) in tokens.into_iter().enumerate() {
424 if token.line_num < start_line {
425 continue;
426 }
427
428 if token.line_num > end_line {
429 break;
430 }
431
432 let mut is_included = false;
433
434 if token
435 .pos
436 .is_some_and(|pos| token.is_known && matched_positions.contains(pos))
437 {
438 token.is_matched = true;
439 is_included = true;
440 }
441
442 if !started && token.pos == Some(start_pos) {
443 started = true;
444 is_included = true;
445 }
446
447 if started && !finished {
448 is_included = true;
449 }
450
451 if token.pos == Some(end_pos) {
452 finished = true;
453 started = false;
454 end_real_pos = Some(real_pos);
455 }
456
457 if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
458 end_real_pos = None;
459 if !token.is_text && !token.value.trim().is_empty() {
460 is_included = true;
461 }
462 }
463
464 last_real_pos = Some(real_pos);
465
466 if is_included {
467 reportable.push(token);
468 }
469 }
470
471 reportable
472}
473
/// Returns the terminator of every line in `text`, in order: `"\r\n"`, `"\n"`,
/// or an empty string for a final line without a newline.
///
/// Empty input yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let ending = match line.as_bytes() {
            [.., b'\r', b'\n'] => "\r\n",
            [.., b'\n'] => "\n",
            _ => "",
        };
        endings.push(ending.to_string());
    }

    endings
}
487
488fn render_unknown_rule_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
489 let mut rendered = String::new();
490 let mut previous_line: Option<usize> = None;
491
492 for token in tokens {
493 if let Some(prev_line) = previous_line
494 && token.line_num > prev_line
495 {
496 for line in prev_line..token.line_num {
497 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
498 rendered.push_str(line_ending.as_str());
499 }
500 }
501 }
502
503 let token_value = if token.is_text {
504 token.value.as_str()
505 } else {
506 token
507 .value
508 .strip_suffix("\r\n")
509 .or_else(|| token.value.strip_suffix('\n'))
510 .unwrap_or(token.value.as_str())
511 };
512
513 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
514 if token.is_matched {
515 rendered.push_str(token_value);
516 } else {
517 rendered.push('.');
518 }
519 } else {
520 rendered.push_str(token_value);
521 }
522
523 previous_line = Some(token.line_num);
524 }
525
526 rendered
527}
528
529fn build_unknown_rule_identifier(rule_text: &str) -> String {
530 let content = format!("None{}", python_str_repr(rule_text));
531 let mut hasher = Sha1::new();
532 hasher.update(content.as_bytes());
533 let digest = hasher.finalize();
534
535 format!("license-detection-unknown-{}", hex::encode(digest))
536}
537
/// Formats `text` the way Python's `repr()` formats a `str`.
///
/// * The string is wrapped in single quotes, unless it contains a single quote
///   and no double quote, in which case double quotes are used.
/// * Backslash, newline, carriage return, tab, and the chosen quote character
///   are backslash-escaped.
/// * Control characters and whitespace other than the ASCII space are written
///   as `\xNN` (code point up to 0xff), `\uNNNN`, or `\UNNNNNNNN`, with
///   lowercase hex digits.
///
/// Limitation: Python also escapes format characters, private-use characters,
/// and unassigned code points; the standard library offers no way to detect
/// those, so they are emitted unchanged.
fn python_str_repr(text: &str) -> String {
    let use_double_quotes = text.contains('\'') && !text.contains('"');
    let quote = if use_double_quotes { '"' } else { '\'' };

    let mut repr = String::with_capacity(text.len() + 2);
    repr.push(quote);

    for ch in text.chars() {
        match ch {
            '\\' => repr.push_str("\\\\"),
            '\n' => repr.push_str("\\n"),
            '\r' => repr.push_str("\\r"),
            '\t' => repr.push_str("\\t"),
            c if c == quote => {
                repr.push('\\');
                repr.push(c);
            }
            // Non-printable for Python: every control character and every
            // whitespace character except the plain space.
            c if c.is_control() || (c.is_whitespace() && c != ' ') => {
                let code = u32::from(c);
                let escape = if code <= 0xff {
                    format!("\\x{code:02x}")
                } else if code <= 0xffff {
                    format!("\\u{code:04x}")
                } else {
                    format!("\\U{code:08x}")
                };
                repr.push_str(&escape);
            }
            c => repr.push(c),
        }
    }

    repr.push(quote);
    repr
}
557
558fn calculate_score(ngram_count: usize, match_len: usize) -> MatchScore {
559 if match_len == 0 {
560 return MatchScore::default();
561 }
562
563 let density = ngram_count as f64 / match_len as f64;
564
565 MatchScore::from_percentage(density.min(1.0) * 100.0)
566}
567
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::automaton::AutomatonBuilder;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::{TokenId, tid};
    use crate::license_detection::query::Query;

    /// Converts raw `u16` ids into `TokenId`s.
    fn tids(values: &[u16]) -> Vec<TokenId> {
        values.iter().copied().map(tid).collect()
    }

    /// Builds a `PositionSet` from a list of positions.
    fn positions(values: &[usize]) -> PositionSet {
        values.iter().copied().collect()
    }

    /// Builds a fully matched single-line `Aho` match used as a "known match"
    /// fixture. The SPDX expression is the upper-cased `expression`, and the
    /// rule length equals `matched_length`.
    fn aho_match(
        expression: &str,
        matched_text: &str,
        start_token: usize,
        end_token: usize,
        matched_length: usize,
        span: PositionSpan,
    ) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: expression.to_string(),
            license_expression_spdx: Some(expression.to_uppercase()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token,
            end_token,
            matcher: MatcherKind::Aho,
            score: MatchScore::MAX,
            matched_length,
            rule_length: matched_length,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some(matched_text.to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(span),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    #[test]
    fn test_unknown_match_empty_query() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("", &index, false).expect("Failed to create query");
        let known_matches: Vec<LicenseMatch> = Vec::new();

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(matches.is_empty());
    }

    #[test]
    fn test_find_unmatched_regions_no_coverage() {
        let regions = find_unmatched_regions(10, &PositionSet::new());

        assert_eq!(regions, vec![(0, 10)]);
    }

    #[test]
    fn test_find_unmatched_regions_full_coverage() {
        let covered_positions: PositionSet = (0..10).collect();

        let regions = find_unmatched_regions(10, &covered_positions);

        assert!(regions.is_empty());
    }

    #[test]
    fn test_find_unmatched_regions_partial_coverage() {
        let covered_positions = positions(&[0, 1, 2, 12, 13, 14, 15, 16, 17, 18, 19]);

        let regions = find_unmatched_regions(20, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (3, 12));
    }

    #[test]
    fn test_find_unmatched_regions_trailing_unmatched() {
        let covered_positions = positions(&[0, 1, 2, 3, 4, 5]);

        let regions = find_unmatched_regions(20, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (6, 20));
    }

    #[test]
    fn test_find_unmatched_regions_leading_unmatched() {
        let covered_positions = positions(&[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]);

        let regions = find_unmatched_regions(20, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (0, 10));
    }

    #[test]
    fn test_find_unmatched_regions_middle_gap() {
        let covered_positions =
            positions(&[0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]);

        let regions = find_unmatched_regions(30, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (5, 20));
    }

    #[test]
    fn test_compute_qspan_union_empty() {
        let merged = compute_qspan_union(&[]);
        assert!(merged.is_empty());
    }

    #[test]
    fn test_compute_qspan_union_single() {
        let merged = compute_qspan_union(&[(5, 11)]);
        assert_eq!(merged.len(), 6);
        assert!(merged.contains(5));
        assert!(merged.contains(10));
        assert!(!merged.contains(4));
        assert!(!merged.contains(11));
    }

    #[test]
    fn test_compute_qspan_union_overlapping() {
        let merged = compute_qspan_union(&[(5, 11), (8, 14), (20, 26)]);
        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(!merged.contains(14));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
        assert!(!merged.contains(26));
    }

    #[test]
    fn test_compute_qspan_union_adjacent() {
        let merged = compute_qspan_union(&[(5, 11), (11, 17)]);
        assert_eq!(merged.len(), 12);
        assert!(merged.contains(5));
        assert!(merged.contains(16));
        assert!(!merged.contains(4));
        assert!(!merged.contains(17));
    }

    #[test]
    fn test_compute_qspan_union_unsorted() {
        let merged = compute_qspan_union(&[(20, 26), (5, 11), (8, 14)]);
        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
    }

    #[test]
    fn test_compute_hispan_from_qspan() {
        let mut index = LicenseIndex::with_legalese_count(0);
        let legalese_entries: Vec<(String, u16)> =
            (0u16..15).map(|i| (format!("legalese-{i}"), i)).collect();
        index.dictionary =
            crate::license_detection::index::dictionary::TokenDictionary::new_with_legalese_pairs(
                &legalese_entries
                    .iter()
                    .map(|(token, id)| (token.as_str(), *id))
                    .collect::<Vec<_>>(),
            );

        // Positions 0..15 hold legalese tokens, 15..30 hold regular tokens.
        let mut tokens: Vec<TokenId> = (0..15)
            .map(|i| {
                index
                    .dictionary
                    .get_token_id(&format!("legalese-{i}"))
                    .unwrap()
            })
            .collect();
        for i in 15..30 {
            tokens.push(index.dictionary.get_or_assign(&format!("regular-{i}")));
        }

        let qspan = positions(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24]);
        let hispan = compute_hispan_from_qspan(&tokens, &qspan, &index);
        assert_eq!(hispan, 10);
    }

    #[test]
    fn test_get_matched_ngrams_empty_automaton() {
        let tokens = tids(&[1, 2, 3, 4, 5, 6, 7, 8]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 0, 8, &automaton);

        assert!(matches.is_empty());
    }

    #[test]
    fn test_get_matched_ngrams_with_matches() {
        let tokens: Vec<TokenId> = (0..30).map(tid).collect();
        let ngram: Vec<u8> = vec![0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0];

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, 30, &automaton);

        assert!(!matches.is_empty(), "Should find ngram matches");

        for (qstart, qend) in &matches {
            assert_eq!(*qend - *qstart, UNKNOWN_NGRAM_LENGTH);
        }
    }

    #[test]
    fn test_get_matched_ngrams_keeps_overlapping_matches() {
        let tokens = tids(&[1, 2, 3, 1, 2, 3, 1, 2, 3]);
        let overlapping_ngram: Vec<u8> = tokens[..UNKNOWN_NGRAM_LENGTH]
            .iter()
            .flat_map(|tid| tid.to_le_bytes())
            .collect();

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&overlapping_ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, tokens.len(), &automaton);

        assert_eq!(matches, vec![(0, 6), (3, 9)]);
    }

    #[test]
    fn test_get_matched_ngrams_out_of_bounds() {
        let tokens = tids(&[1, 2, 3]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 5, 10, &automaton);
        assert!(matches.is_empty(), "Out of bounds should return empty");

        let matches = get_matched_ngrams(&tokens, 2, 1, &automaton);
        assert!(matches.is_empty(), "Invalid range should return empty");
    }

    #[test]
    fn test_calculate_score() {
        let score1 = calculate_score(5, 10);
        let score2 = calculate_score(10, 10);
        let score3 = calculate_score(0, 10);

        assert!(score2 > score1);
        assert!(score2 <= MatchScore::MAX);
        assert_eq!(score3, MatchScore::default());
    }

    #[test]
    fn test_calculate_score_edge_cases() {
        let score_zero_length = calculate_score(10, 0);
        assert_eq!(
            score_zero_length,
            MatchScore::default(),
            "Zero length should have zero score"
        );

        let score_zero_ngrams = calculate_score(0, 100);
        assert_eq!(
            score_zero_ngrams,
            MatchScore::default(),
            "Zero ngrams should have zero score"
        );

        let score_high_density = calculate_score(100, 50);
        assert_eq!(
            score_high_density,
            MatchScore::MAX,
            "High density should be capped at 100.0"
        );
    }

    #[test]
    fn test_compute_covered_positions_gapped_qspan() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![aho_match(
            "test",
            "matched text",
            0,
            10,
            6,
            PositionSpan::from_positions(vec![0, 1, 2, 7, 8, 9]),
        )];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(0), "Should contain position 0");
        assert!(covered.contains(2), "Should contain position 2");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(!covered.contains(3), "Should NOT contain position 3 (gap)");
        assert!(!covered.contains(5), "Should NOT contain position 5 (gap)");
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (outside)"
        );
    }

    #[test]
    fn test_compute_covered_positions_fallback_contiguous() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![aho_match(
            "test",
            "matched text",
            5,
            10,
            5,
            PositionSpan::range(5, 10),
        )];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(5), "Should contain position 5");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(
            !covered.contains(4),
            "Should NOT contain position 4 (before)"
        );
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (after)"
        );
    }

    #[test]
    fn test_compute_covered_positions_qspan_creates_extra_unmatched_region() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![aho_match(
            "test",
            "matched text",
            0,
            15,
            8,
            PositionSpan::from_positions(vec![0, 1, 2, 3, 11, 12, 13, 14]),
        )];

        let covered = compute_covered_positions(&query, &known_matches);
        let regions = find_unmatched_regions(20, &covered);

        assert!(
            regions.contains(&(4, 11)),
            "Should have unmatched region 4-11 (the gap in qspan_positions), got: {:?}",
            regions
        );
        assert!(
            regions.contains(&(15, 20)),
            "Should have trailing unmatched region 15-20, got: {:?}",
            regions
        );

        let contiguous_covered: PositionSet = (0..15).collect();
        let contiguous_regions = find_unmatched_regions(20, &contiguous_covered);
        assert_eq!(
            contiguous_regions,
            vec![(15, 20)],
            "Contiguous coverage would collapse the gap, producing only trailing region"
        );
    }

    #[test]
    fn test_create_unknown_match_from_qspan_valid() {
        use crate::license_detection::test_utils::create_mock_query_with_tokens;

        let index = LicenseIndex::with_legalese_count(10);

        let tokens: Vec<u16> = (0..30).collect();
        let query = create_mock_query_with_tokens(&tokens, &index);

        let qspan: PositionSet = (0..30).collect();

        let match_result = create_unknown_match_from_qspan(&query, &qspan);

        assert!(
            match_result.is_some(),
            "Should create unknown match for sufficient length"
        );

        let m = match_result.unwrap();
        assert_eq!(m.license_expression, "unknown");
        assert_eq!(m.matcher, MATCH_UNKNOWN);
        assert!(!m.coordinates.query_span().is_empty());
    }

    #[test]
    fn test_unknown_match_with_known_matches() {
        let index = LicenseIndex::with_legalese_count(10);
        let text = "some text that is license related and should be detected";
        let query =
            Query::from_extracted_text(text, &index, false).expect("Failed to create query");

        let known_matches = vec![aho_match(
            "mit",
            "some text",
            0,
            5,
            5,
            PositionSpan::range(0, 5),
        )];

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(
            matches.is_empty() || matches[0].start_line > LineNumber::ONE,
            "Should not re-detect known regions"
        );
    }

    #[test]
    fn test_collect_line_endings() {
        assert_eq!(
            collect_line_endings("a\r\nb\nc"),
            vec!["\r\n".to_string(), "\n".to_string(), String::new()]
        );
        assert!(collect_line_endings("").is_empty());
    }

    #[test]
    fn test_python_str_repr_quoting() {
        assert_eq!(python_str_repr("abc"), "'abc'");
        assert_eq!(python_str_repr("it's"), "\"it's\"");
        assert_eq!(python_str_repr("a'b\"c"), "'a\\'b\"c'");
        assert_eq!(python_str_repr("a\\b\n"), "'a\\\\b\\n'");
    }
}