1use crate::license_detection::automaton::Automaton;
7use regex::Regex;
8use sha1::{Digest, Sha1};
9use std::sync::LazyLock;
10
11use crate::license_detection::index::LicenseIndex;
12use crate::license_detection::index::dictionary::{TokenId, TokenKind};
13use crate::license_detection::models::position_span::PositionSpan;
14use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
15use crate::license_detection::position_set::PositionSet;
16use crate::license_detection::query::Query;
17use crate::license_detection::tokenize::STOPWORDS;
18use crate::models::LineNumber;
19use crate::models::MatchScore;
20
/// Matcher kind stamped on every match produced by this module.
pub const MATCH_UNKNOWN: MatcherKind = MatcherKind::Unknown;

/// Number of tokens in one ngram reported by the unknown automaton. It is used
/// to derive an ngram's start from its end, and (times four) as the minimum
/// number of covered positions a region needs to be reported.
const UNKNOWN_NGRAM_LENGTH: usize = 6;

/// Minimum number of ngram hits a region needs before it is considered further.
const MIN_NGRAM_MATCHES: usize = 3;

/// Unmatched regions spanning fewer tokens than this are skipped outright.
const MIN_REGION_LENGTH: usize = 5;
28
/// Matches one word token: a run of word characters (underscore excluded),
/// optionally followed by a single `+` and further word characters.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
/// Splits text into alternating captures: the named group `token` uses the same
/// word shape as `QUERY_PATTERN`, and the named group `punct` captures the
/// punctuation/whitespace runs in between.
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("Invalid matched text regex pattern")
});
35
/// One piece of the original text (a word or a punctuation run) annotated with
/// the information needed to decide whether and how it is rendered into the
/// synthetic rule text of an unknown match.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the token as it will be rendered.
    value: String,
    /// 1-based number of the line the token was found on.
    line_num: usize,
    /// Sequential position assigned to tokens found in the dictionary;
    /// `None` for punctuation and for words the dictionary does not know.
    pos: Option<usize>,
    /// `true` for word tokens, `false` for punctuation/whitespace runs.
    is_text: bool,
    /// `true` when the (lowercased) word is present in the index dictionary.
    is_known: bool,
    /// Set once the token's position is found in the matched position set.
    is_matched: bool,
}
45
46pub fn unknown_match(
47 index: &LicenseIndex,
48 query: &Query,
49 known_matches: &[LicenseMatch],
50) -> Vec<LicenseMatch> {
51 let mut unknown_matches = Vec::new();
52
53 if query.tokens.is_empty() {
54 return unknown_matches;
55 }
56
57 let query_len = query.tokens.len();
58
59 let covered_positions = compute_covered_positions(query, known_matches);
60
61 let unmatched_regions = find_unmatched_regions(query_len, &covered_positions);
62
63 let automaton = &index.unknown_automaton;
64
65 for region in unmatched_regions {
66 let start = region.0;
67 let end = region.1;
68
69 let region_length = end - start;
70 if region_length < MIN_REGION_LENGTH {
71 continue;
72 }
73
74 let matched_ngrams = get_matched_ngrams(&query.tokens, start, end, automaton);
75
76 if matched_ngrams.len() < MIN_NGRAM_MATCHES {
77 continue;
78 }
79
80 let qspan = compute_qspan_union(&matched_ngrams);
81
82 if qspan.is_empty() {
83 continue;
84 }
85
86 let qspan_length = qspan.len();
87
88 #[cfg(debug_assertions)]
90 {
91 eprintln!("\n=== UNKNOWN MATCH DEBUG ===");
92 eprintln!("Region: {}-{} ({} tokens)", start, end, region_length);
93 eprintln!("matched_ngrams: {} matches", matched_ngrams.len());
94 eprintln!("qspan: {:?}", qspan);
95 eprintln!(
96 "qspan_length: {} (threshold: {})",
97 qspan_length,
98 UNKNOWN_NGRAM_LENGTH * 4
99 );
100 }
101
102 if qspan_length < UNKNOWN_NGRAM_LENGTH * 4 {
103 continue;
104 }
105
106 let hispan = compute_hispan_from_qspan(&query.tokens, &qspan, index);
107
108 #[cfg(debug_assertions)]
109 {
110 eprintln!("hispan: {} (threshold: 5)", hispan);
111 }
112
113 if hispan < 5 {
114 continue;
115 }
116
117 if let Some(match_result) = create_unknown_match_from_qspan(query, &qspan) {
118 unknown_matches.push(match_result);
119 }
120 }
121
122 unknown_matches
123}
124
125fn compute_covered_positions(_query: &Query, known_matches: &[LicenseMatch]) -> PositionSet {
126 let mut covered = PositionSet::new();
127 for m in known_matches {
128 covered.extend_from_span(m.query_span());
129 }
130 covered
131}
132
133fn find_unmatched_regions(
134 query_len: usize,
135 covered_positions: &PositionSet,
136) -> Vec<(usize, usize)> {
137 let mut regions = Vec::new();
138
139 if query_len == 0 {
140 return regions;
141 }
142
143 let mut region_start = None;
144
145 for pos in 0..query_len {
146 if !covered_positions.contains(pos) {
147 if region_start.is_none() {
148 region_start = Some(pos);
149 }
150 } else if let Some(start) = region_start {
151 regions.push((start, pos));
152 region_start = None;
153 }
154 }
155
156 if let Some(start) = region_start {
157 regions.push((start, query_len));
158 }
159
160 regions
161}
162
163fn get_matched_ngrams(
164 tokens: &[TokenId],
165 start: usize,
166 end: usize,
167 automaton: &Automaton,
168) -> Vec<(usize, usize)> {
169 if start >= end || end > tokens.len() {
170 return Vec::new();
171 }
172
173 let region_tokens = &tokens[start..end];
174
175 let region_bytes: Vec<u8> = region_tokens
176 .iter()
177 .flat_map(|tid| tid.to_le_bytes())
178 .collect();
179
180 let offset = UNKNOWN_NGRAM_LENGTH;
181 let mut matches = Vec::new();
182
183 for m in automaton.find_overlapping_iter(®ion_bytes) {
184 let local_qend = m.end / 2;
185 let qend = start + local_qend;
186 let qstart = qend.saturating_sub(offset);
187 matches.push((qstart, qend));
188 }
189
190 matches
191}
192
193fn compute_qspan_union(positions: &[(usize, usize)]) -> PositionSet {
194 if positions.is_empty() {
195 return PositionSet::new();
196 }
197
198 let mut sorted: Vec<_> = positions.to_vec();
199 sorted.sort_by_key(|p| p.0);
200
201 let mut merged: Vec<(usize, usize)> = Vec::new();
202 let mut current = sorted[0];
203
204 for (start, end) in sorted.into_iter().skip(1) {
205 if start <= current.1 {
206 current.1 = current.1.max(end);
207 } else {
208 merged.push(current);
209 current = (start, end);
210 }
211 }
212 merged.push(current);
213
214 let mut result = PositionSet::new();
215 for (start, end) in merged {
216 result.extend_from_span(&PositionSpan::range(start, end));
217 }
218 result
219}
220
221fn compute_hispan_from_qspan(
222 tokens: &[TokenId],
223 qspan: &PositionSet,
224 index: &LicenseIndex,
225) -> usize {
226 qspan
227 .iter()
228 .filter(|&pos| {
229 tokens
230 .get(pos)
231 .is_some_and(|&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
232 })
233 .count()
234}
235
236fn create_unknown_match_from_qspan(query: &Query, qspan: &PositionSet) -> Option<LicenseMatch> {
237 if qspan.is_empty() {
238 return None;
239 }
240
241 let match_len = qspan.len();
242
243 let start = qspan.min_pos();
244 let end = qspan.max_pos() + 1;
245
246 let start_line = query
247 .line_by_pos
248 .get(start)
249 .copied()
250 .and_then(LineNumber::new)
251 .unwrap_or(LineNumber::ONE);
252 let end_line = query
253 .line_by_pos
254 .get(end.saturating_sub(1))
255 .copied()
256 .and_then(LineNumber::new)
257 .unwrap_or(start_line);
258
259 let qspan_positions: Vec<usize> = qspan.iter().collect();
260 let synthetic_rule_text =
261 build_unknown_rule_text(query, &qspan_positions, start_line, end_line);
262 let rule_identifier = build_unknown_rule_identifier(&synthetic_rule_text);
263
264 let ngram_count = qspan.len();
265
266 let score = calculate_score(ngram_count, match_len);
267
268 let qspan_span = qspan.to_position_span();
269
270 LicenseMatch {
271 rid: 0,
272 license_expression: "unknown".to_string(),
273 license_expression_spdx: None,
274 from_file: None,
275 start_line,
276 end_line,
277 start_token: start,
278 end_token: end,
279 matcher: MATCH_UNKNOWN,
280 score,
281 matched_length: match_len,
282 rule_length: match_len,
283 match_coverage: 100.0,
284 rule_relevance: 50,
285 rule_identifier,
286 rule_url: String::new(),
287 matched_text: None,
288 referenced_filenames: None,
289 rule_kind: crate::license_detection::models::RuleKind::None,
290 is_from_license: false,
291 rule_start_token: 0,
292 coordinates: MatchCoordinates::query_region(qspan_span),
293 candidate_resemblance: 0.0,
294 candidate_containment: 0.0,
295 }
296 .into()
297}
298
299fn build_unknown_rule_text(
300 query: &Query,
301 qspan_positions: &[usize],
302 start_line: LineNumber,
303 end_line: LineNumber,
304) -> String {
305 let Some(&start_pos) = qspan_positions.first() else {
306 return String::new();
307 };
308 let Some(&end_pos) = qspan_positions.last() else {
309 return String::new();
310 };
311
312 let matched_positions: PositionSet = qspan_positions.iter().copied().collect();
313 let tokens = tokenize_matched_unknown_text(&query.text, query);
314 let reportable_tokens = collect_reportable_unknown_tokens(
315 tokens,
316 &matched_positions,
317 start_pos,
318 end_pos,
319 start_line.get(),
320 end_line.get(),
321 );
322 let line_endings = collect_line_endings(&query.text);
323
324 render_unknown_rule_tokens(&reportable_tokens, &line_endings)
325}
326
327fn tokenize_matched_unknown_text(text: &str, query: &Query) -> Vec<MatchedTextToken> {
328 let mut tokens = Vec::new();
329 let mut pos = 0usize;
330 for (line_num, line) in (1usize..).zip(text.split_inclusive('\n')) {
331 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
332 if let Some(token_match) = capture.name("token") {
333 let token_text = token_match.as_str();
334 let retokenized: Vec<String> = QUERY_PATTERN
335 .find_iter(&token_text.to_lowercase())
336 .map(|m| m.as_str().to_string())
337 .filter(|token| !STOPWORDS.contains(token.as_str()))
338 .collect();
339
340 if retokenized.is_empty() {
341 tokens.push(MatchedTextToken {
342 value: token_text.to_string(),
343 line_num,
344 pos: None,
345 is_text: true,
346 is_known: false,
347 is_matched: false,
348 });
349 } else if retokenized.len() == 1 {
350 let token = &retokenized[0];
351 let is_known = query.index.dictionary.get(token).is_some();
352 let token_pos = if is_known {
353 let current_pos = pos;
354 pos += 1;
355 Some(current_pos)
356 } else {
357 None
358 };
359
360 tokens.push(MatchedTextToken {
361 value: token_text.to_string(),
362 line_num,
363 pos: token_pos,
364 is_text: true,
365 is_known,
366 is_matched: false,
367 });
368 } else {
369 for token in retokenized {
370 let is_known = query.index.dictionary.get(&token).is_some();
371 let token_pos = if is_known {
372 let current_pos = pos;
373 pos += 1;
374 Some(current_pos)
375 } else {
376 None
377 };
378
379 tokens.push(MatchedTextToken {
380 value: token,
381 line_num,
382 pos: token_pos,
383 is_text: true,
384 is_known,
385 is_matched: false,
386 });
387 }
388 }
389 } else if let Some(punct_match) = capture.name("punct") {
390 tokens.push(MatchedTextToken {
391 value: punct_match.as_str().to_string(),
392 line_num,
393 pos: None,
394 is_text: false,
395 is_known: false,
396 is_matched: false,
397 });
398 }
399 }
400 }
401
402 tokens
403}
404
405fn collect_reportable_unknown_tokens(
406 tokens: Vec<MatchedTextToken>,
407 matched_positions: &PositionSet,
408 start_pos: usize,
409 end_pos: usize,
410 start_line: usize,
411 end_line: usize,
412) -> Vec<MatchedTextToken> {
413 let mut reportable = Vec::new();
414 let mut started = false;
415 let mut finished = false;
416 let mut end_real_pos = None;
417 let mut last_real_pos = None;
418
419 for (real_pos, mut token) in tokens.into_iter().enumerate() {
420 if token.line_num < start_line {
421 continue;
422 }
423
424 if token.line_num > end_line {
425 break;
426 }
427
428 let mut is_included = false;
429
430 if token
431 .pos
432 .is_some_and(|pos| token.is_known && matched_positions.contains(pos))
433 {
434 token.is_matched = true;
435 is_included = true;
436 }
437
438 if !started && token.pos == Some(start_pos) {
439 started = true;
440 is_included = true;
441 }
442
443 if started && !finished {
444 is_included = true;
445 }
446
447 if token.pos == Some(end_pos) {
448 finished = true;
449 started = false;
450 end_real_pos = Some(real_pos);
451 }
452
453 if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
454 end_real_pos = None;
455 if !token.is_text && !token.value.trim().is_empty() {
456 is_included = true;
457 }
458 }
459
460 last_real_pos = Some(real_pos);
461
462 if is_included {
463 reportable.push(token);
464 }
465 }
466
467 reportable
468}
469
/// Records the terminator of every line in `text`, in order.
///
/// Each entry is `"\r\n"`, `"\n"`, or the empty string for a final line with
/// no terminator. Empty text yields an empty vector.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let terminator = match line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(terminator.to_string());
    }

    endings
}
483
484fn render_unknown_rule_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
485 let mut rendered = String::new();
486 let mut previous_line: Option<usize> = None;
487
488 for token in tokens {
489 if let Some(prev_line) = previous_line
490 && token.line_num > prev_line
491 {
492 for line in prev_line..token.line_num {
493 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
494 rendered.push_str(line_ending.as_str());
495 }
496 }
497 }
498
499 let token_value = if token.is_text {
500 token.value.as_str()
501 } else {
502 token
503 .value
504 .strip_suffix("\r\n")
505 .or_else(|| token.value.strip_suffix('\n'))
506 .unwrap_or(token.value.as_str())
507 };
508
509 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
510 if token.is_matched {
511 rendered.push_str(token_value);
512 } else {
513 rendered.push('.');
514 }
515 } else {
516 rendered.push_str(token_value);
517 }
518
519 previous_line = Some(token.line_num);
520 }
521
522 rendered
523}
524
525fn build_unknown_rule_identifier(rule_text: &str) -> String {
526 let content = format!("None{}", python_str_repr(rule_text));
527 let mut hasher = Sha1::new();
528 hasher.update(content.as_bytes());
529 let digest = hasher.finalize();
530
531 format!("license-detection-unknown-{}", hex::encode(digest))
532}
533
/// Formats `text` the way Python's `repr()` formats a `str`.
///
/// - The result is wrapped in single quotes, unless `text` contains a single
///   quote and no double quote, in which case double quotes are used.
/// - Backslashes and the active quote character are backslash-escaped, and
///   newline, carriage return and tab become `\n`, `\r`, `\t`.
/// - Any other character Python treats as non-printable is written as `\xNN`,
///   `\uNNNN` or `\UNNNNNNNN` (lowercase hex), chosen by code point size.
///
/// Non-printable detection covers control characters, Unicode whitespace and
/// separators other than the ASCII space, and the common invisible format
/// characters. Private-use and unassigned code points are NOT detected because
/// the standard library carries no general-category table; they are passed
/// through unchanged.
fn python_str_repr(text: &str) -> String {
    /// Approximates the negation of Python's `str.isprintable()` for one char.
    fn is_non_printable(ch: char) -> bool {
        if ch == ' ' {
            return false;
        }
        ch.is_control()
            || ch.is_whitespace()
            || matches!(
                u32::from(ch),
                0x00AD | 0x200B..=0x200F | 0x202A..=0x202E | 0x2060..=0x206F | 0xFEFF
            )
    }

    let use_double_quotes = text.contains('\'') && !text.contains('"');
    let quote = if use_double_quotes { '"' } else { '\'' };
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            // Double quotes are only chosen when the text holds no `"`, so the
            // single quote is the only quote character that ever needs escaping.
            '\'' if !use_double_quotes => escaped.push_str("\\'"),
            _ if is_non_printable(ch) => {
                let code = u32::from(ch);
                let escape = if code <= 0xff {
                    format!("\\x{code:02x}")
                } else if code <= 0xffff {
                    format!("\\u{code:04x}")
                } else {
                    format!("\\U{code:08x}")
                };
                escaped.push_str(&escape);
            }
            _ => escaped.push(ch),
        }
    }

    format!("{quote}{escaped}{quote}")
}
553
554fn calculate_score(ngram_count: usize, match_len: usize) -> MatchScore {
555 if match_len == 0 {
556 return MatchScore::default();
557 }
558
559 let density = ngram_count as f64 / match_len as f64;
560
561 MatchScore::from_percentage(density.min(1.0) * 100.0)
562}
563
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::{TokenId, tid};
    use crate::license_detection::query::Query;

    /// Converts raw `u16` values into token ids.
    fn tids(values: &[u16]) -> Vec<TokenId> {
        values.iter().map(|&raw| tid(raw)).collect()
    }

    /// Builds a position set from an explicit list of positions.
    fn position_set(positions: &[usize]) -> PositionSet {
        positions.iter().copied().collect()
    }

    /// Builds a single-line, full-score known match covering `span`.
    ///
    /// The SPDX expression is the uppercased `expression`, and `length` is
    /// used for both the matched length and the rule length.
    fn known_match(
        expression: &str,
        start_token: usize,
        end_token: usize,
        length: usize,
        matched_text: &str,
        span: PositionSpan,
    ) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: expression.to_string(),
            license_expression_spdx: Some(expression.to_uppercase()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token,
            end_token,
            matcher: MatcherKind::Aho,
            score: MatchScore::MAX,
            matched_length: length,
            rule_length: length,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some(matched_text.to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(span),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    #[test]
    fn test_unknown_match_empty_query() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("", &index, false).expect("Failed to create query");

        let matches = unknown_match(&index, &query, &[]);

        assert!(matches.is_empty());
    }

    #[test]
    fn test_find_unmatched_regions_no_coverage() {
        let regions = find_unmatched_regions(10, &PositionSet::new());

        assert_eq!(regions, vec![(0, 10)]);
    }

    #[test]
    fn test_find_unmatched_regions_full_coverage() {
        let covered: PositionSet = (0..10).collect();

        let regions = find_unmatched_regions(10, &covered);

        assert!(regions.is_empty());
    }

    #[test]
    fn test_find_unmatched_regions_partial_coverage() {
        let covered = position_set(&[0, 1, 2, 12, 13, 14, 15, 16, 17, 18, 19]);

        let regions = find_unmatched_regions(20, &covered);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (3, 12));
    }

    #[test]
    fn test_find_unmatched_regions_trailing_unmatched() {
        let covered = position_set(&[0, 1, 2, 3, 4, 5]);

        let regions = find_unmatched_regions(20, &covered);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (6, 20));
    }

    #[test]
    fn test_compute_qspan_union_empty() {
        let merged = compute_qspan_union(&[]);

        assert!(merged.is_empty());
    }

    #[test]
    fn test_compute_qspan_union_single() {
        let merged = compute_qspan_union(&[(5, 11)]);

        assert_eq!(merged.len(), 6);
        assert!(merged.contains(5));
        assert!(merged.contains(10));
        assert!(!merged.contains(4));
        assert!(!merged.contains(11));
    }

    #[test]
    fn test_compute_qspan_union_overlapping() {
        let merged = compute_qspan_union(&[(5, 11), (8, 14), (20, 26)]);

        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(!merged.contains(14));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
        assert!(!merged.contains(26));
    }

    #[test]
    fn test_compute_qspan_union_adjacent() {
        let merged = compute_qspan_union(&[(5, 11), (11, 17)]);

        assert_eq!(merged.len(), 12);
        assert!(merged.contains(5));
        assert!(merged.contains(16));
        assert!(!merged.contains(4));
        assert!(!merged.contains(17));
    }

    #[test]
    fn test_compute_qspan_union_unsorted() {
        let merged = compute_qspan_union(&[(20, 26), (5, 11), (8, 14)]);

        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
    }

    #[test]
    fn test_compute_hispan_from_qspan() {
        let mut index = LicenseIndex::with_legalese_count(0);
        let legalese_names: Vec<String> = (0u16..15).map(|i| format!("legalese-{i}")).collect();
        let legalese_pairs: Vec<(&str, u16)> = legalese_names
            .iter()
            .map(String::as_str)
            .zip(0u16..15)
            .collect();
        index.dictionary =
            crate::license_detection::index::dictionary::TokenDictionary::new_with_legalese_pairs(
                &legalese_pairs,
            );

        // Positions 0..15 hold legalese tokens, 15..30 hold regular ones.
        let mut tokens: Vec<TokenId> = legalese_names
            .iter()
            .map(|name| index.dictionary.get_token_id(name).unwrap())
            .collect();
        for i in 15..30 {
            tokens.push(index.dictionary.get_or_assign(&format!("regular-{i}")));
        }

        let qspan: PositionSet = (0..10).chain(20..25).collect();

        let hispan = compute_hispan_from_qspan(&tokens, &qspan, &index);

        assert_eq!(hispan, 10);
    }

    #[test]
    fn test_get_matched_ngrams_empty_automaton() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3, 4, 5, 6, 7, 8]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 0, 8, &automaton);

        assert!(matches.is_empty());
    }

    #[test]
    fn test_get_matched_ngrams_with_matches() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens: Vec<TokenId> = (0..30).map(tid).collect();
        // Little-endian encoding of token ids 0..=5.
        let ngram: Vec<u8> = vec![0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0];

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, 30, &automaton);

        assert!(!matches.is_empty(), "Should find ngram matches");
        for &(qstart, qend) in &matches {
            assert_eq!(qend - qstart, UNKNOWN_NGRAM_LENGTH);
        }
    }

    #[test]
    fn test_get_matched_ngrams_keeps_overlapping_matches() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3, 1, 2, 3, 1, 2, 3]);
        let overlapping_ngram: Vec<u8> = tokens[..UNKNOWN_NGRAM_LENGTH]
            .iter()
            .flat_map(|tid| tid.to_le_bytes())
            .collect();

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&overlapping_ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, tokens.len(), &automaton);

        assert_eq!(matches, vec![(0, 6), (3, 9)]);
    }

    #[test]
    fn test_calculate_score() {
        let half = calculate_score(5, 10);
        let full = calculate_score(10, 10);
        let none = calculate_score(0, 10);

        assert!(full > half);
        assert!(full <= MatchScore::MAX);
        assert_eq!(none, MatchScore::default());
    }

    #[test]
    fn test_find_unmatched_regions_leading_unmatched() {
        let covered = position_set(&[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]);

        let regions = find_unmatched_regions(20, &covered);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (0, 10));
    }

    #[test]
    fn test_find_unmatched_regions_middle_gap() {
        let covered = position_set(&[0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]);

        let regions = find_unmatched_regions(30, &covered);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (5, 20));
    }

    #[test]
    fn test_compute_covered_positions_gapped_qspan() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![known_match(
            "test",
            0,
            10,
            6,
            "matched text",
            PositionSpan::from_positions(vec![0, 1, 2, 7, 8, 9]),
        )];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(0), "Should contain position 0");
        assert!(covered.contains(2), "Should contain position 2");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(!covered.contains(3), "Should NOT contain position 3 (gap)");
        assert!(!covered.contains(5), "Should NOT contain position 5 (gap)");
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (outside)"
        );
    }

    #[test]
    fn test_compute_covered_positions_fallback_contiguous() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![known_match(
            "test",
            5,
            10,
            5,
            "matched text",
            PositionSpan::range(5, 10),
        )];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(5), "Should contain position 5");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(
            !covered.contains(4),
            "Should NOT contain position 4 (before)"
        );
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (after)"
        );
    }

    #[test]
    fn test_compute_covered_positions_qspan_creates_extra_unmatched_region() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![known_match(
            "test",
            0,
            15,
            8,
            "matched text",
            PositionSpan::from_positions(vec![0, 1, 2, 3, 11, 12, 13, 14]),
        )];

        let covered = compute_covered_positions(&query, &known_matches);
        let regions = find_unmatched_regions(20, &covered);

        assert!(
            regions.contains(&(4, 11)),
            "Should have unmatched region 4-11 (the gap in qspan_positions), got: {:?}",
            regions
        );
        assert!(
            regions.contains(&(15, 20)),
            "Should have trailing unmatched region 15-20, got: {:?}",
            regions
        );

        let contiguous_covered: PositionSet = (0..15).collect();
        let contiguous_regions = find_unmatched_regions(20, &contiguous_covered);
        assert_eq!(
            contiguous_regions,
            vec![(15, 20)],
            "Contiguous coverage would collapse the gap, producing only trailing region"
        );
    }

    #[test]
    fn test_create_unknown_match_from_qspan_valid() {
        use crate::license_detection::test_utils::create_mock_query_with_tokens;

        let index = LicenseIndex::with_legalese_count(10);
        let tokens: Vec<u16> = (0..30).collect();
        let query = create_mock_query_with_tokens(&tokens, &index);
        let qspan: PositionSet = (0..30).collect();

        let match_result = create_unknown_match_from_qspan(&query, &qspan);

        assert!(
            match_result.is_some(),
            "Should create unknown match for sufficient length"
        );

        let m = match_result.unwrap();
        assert_eq!(m.license_expression, "unknown");
        assert_eq!(m.matcher, MATCH_UNKNOWN);
        assert!(!m.coordinates.query_span().is_empty());
    }

    #[test]
    fn test_unknown_match_with_known_matches() {
        let index = LicenseIndex::with_legalese_count(10);
        let text = "some text that is license related and should be detected";
        let query =
            Query::from_extracted_text(text, &index, false).expect("Failed to create query");

        let known_matches = vec![known_match(
            "mit",
            0,
            5,
            5,
            "some text",
            PositionSpan::range(0, 5),
        )];

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(
            matches.is_empty() || matches[0].start_line > LineNumber::ONE,
            "Should not re-detect known regions"
        );
    }

    #[test]
    fn test_calculate_score_edge_cases() {
        assert_eq!(
            calculate_score(10, 0),
            MatchScore::default(),
            "Zero length should have zero score"
        );

        assert_eq!(
            calculate_score(0, 100),
            MatchScore::default(),
            "Zero ngrams should have zero score"
        );

        assert_eq!(
            calculate_score(100, 50),
            MatchScore::MAX,
            "High density should be capped at 100.0"
        );
    }

    #[test]
    fn test_get_matched_ngrams_out_of_bounds() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3]);
        let automaton = AutomatonBuilder::new().build();

        let past_end = get_matched_ngrams(&tokens, 5, 10, &automaton);
        assert!(past_end.is_empty(), "Out of bounds should return empty");

        let inverted = get_matched_ngrams(&tokens, 2, 1, &automaton);
        assert!(inverted.is_empty(), "Invalid range should return empty");
    }
}