1use crate::license_detection::automaton::Automaton;
4use regex::Regex;
5use sha1::{Digest, Sha1};
6use std::sync::LazyLock;
7
8use crate::license_detection::index::LicenseIndex;
9use crate::license_detection::index::dictionary::{TokenId, TokenKind};
10use crate::license_detection::models::position_span::PositionSpan;
11use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind};
12use crate::license_detection::position_set::PositionSet;
13use crate::license_detection::query::Query;
14use crate::license_detection::tokenize::STOPWORDS;
15use crate::models::LineNumber;
16use crate::models::MatchScore;
17
/// Matcher kind stamped on every match produced by this module.
pub const MATCH_UNKNOWN: MatcherKind = MatcherKind::Unknown;

/// Number of tokens in one unknown-license ngram. It is used to derive an
/// ngram's start from the end offset reported by the automaton, and (times
/// four) as the minimum number of covered positions for a region to qualify.
const UNKNOWN_NGRAM_LENGTH: usize = 6;

/// Minimum number of automaton hits a region needs before it is considered.
const MIN_NGRAM_MATCHES: usize = 3;

/// Unmatched regions with fewer tokens than this are skipped outright.
const MIN_REGION_LENGTH: usize = 5;

/// Word pattern: one or more word characters (underscore excluded), with an
/// optional `+` followed by further word characters. Used to re-tokenize the
/// lowercased text of a `token` capture.
static QUERY_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[^_\W]+\+?[^_\W]*").expect("Invalid regex pattern"));
/// Splits a line into alternating named captures: `token` (same shape as
/// `QUERY_PATTERN`) and `punct` (runs of non-word characters, underscores,
/// whitespace and `+`).
static MATCHED_TEXT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?P<token>[^_\W]+\+?[^_\W]*)|(?P<punct>[_\W\s\+]+[_\W\s]?)")
        .expect("Invalid matched text regex pattern")
});
32
/// One lexical piece of the query text, as produced by
/// `tokenize_matched_unknown_text`: either a word-like token or a run of
/// punctuation/whitespace.
#[derive(Clone)]
struct MatchedTextToken {
    /// Text of the piece. Word tokens keep their original spelling unless the
    /// source token was split into several words, in which case each piece is
    /// the lowercased word.
    value: String,
    /// 1-based number of the line the piece was found on.
    line_num: usize,
    /// Running position assigned to dictionary-known words; `None` otherwise.
    pos: Option<usize>,
    /// `true` for `token` captures, `false` for `punct` captures.
    is_text: bool,
    /// Whether the word was found in the index dictionary.
    is_known: bool,
    /// Set later by `collect_reportable_unknown_tokens` when `pos` belongs to
    /// the matched positions; always `false` right after tokenization.
    is_matched: bool,
}
42
43pub fn unknown_match(
44 index: &LicenseIndex,
45 query: &Query,
46 known_matches: &[LicenseMatch],
47) -> Vec<LicenseMatch> {
48 let mut unknown_matches = Vec::new();
49
50 if query.tokens.is_empty() {
51 return unknown_matches;
52 }
53
54 let query_len = query.tokens.len();
55
56 let covered_positions = compute_covered_positions(query, known_matches);
57
58 let unmatched_regions = find_unmatched_regions(query_len, &covered_positions);
59
60 let automaton = &index.unknown_automaton;
61
62 for region in unmatched_regions {
63 let start = region.0;
64 let end = region.1;
65
66 let region_length = end - start;
67 if region_length < MIN_REGION_LENGTH {
68 continue;
69 }
70
71 let matched_ngrams = get_matched_ngrams(&query.tokens, start, end, automaton);
72
73 if matched_ngrams.len() < MIN_NGRAM_MATCHES {
74 continue;
75 }
76
77 let qspan = compute_qspan_union(&matched_ngrams);
78
79 if qspan.is_empty() {
80 continue;
81 }
82
83 let qspan_length = qspan.len();
84
85 #[cfg(debug_assertions)]
87 {
88 eprintln!("\n=== UNKNOWN MATCH DEBUG ===");
89 eprintln!("Region: {}-{} ({} tokens)", start, end, region_length);
90 eprintln!("matched_ngrams: {} matches", matched_ngrams.len());
91 eprintln!("qspan: {:?}", qspan);
92 eprintln!(
93 "qspan_length: {} (threshold: {})",
94 qspan_length,
95 UNKNOWN_NGRAM_LENGTH * 4
96 );
97 }
98
99 if qspan_length < UNKNOWN_NGRAM_LENGTH * 4 {
100 continue;
101 }
102
103 let hispan = compute_hispan_from_qspan(&query.tokens, &qspan, index);
104
105 #[cfg(debug_assertions)]
106 {
107 eprintln!("hispan: {} (threshold: 5)", hispan);
108 }
109
110 if hispan < 5 {
111 continue;
112 }
113
114 if let Some(match_result) = create_unknown_match_from_qspan(query, &qspan) {
115 unknown_matches.push(match_result);
116 }
117 }
118
119 unknown_matches
120}
121
122fn compute_covered_positions(_query: &Query, known_matches: &[LicenseMatch]) -> PositionSet {
123 let mut covered = PositionSet::new();
124 for m in known_matches {
125 covered.extend_from_span(m.query_span());
126 }
127 covered
128}
129
130fn find_unmatched_regions(
131 query_len: usize,
132 covered_positions: &PositionSet,
133) -> Vec<(usize, usize)> {
134 let mut regions = Vec::new();
135
136 if query_len == 0 {
137 return regions;
138 }
139
140 let mut region_start = None;
141
142 for pos in 0..query_len {
143 if !covered_positions.contains(pos) {
144 if region_start.is_none() {
145 region_start = Some(pos);
146 }
147 } else if let Some(start) = region_start {
148 regions.push((start, pos));
149 region_start = None;
150 }
151 }
152
153 if let Some(start) = region_start {
154 regions.push((start, query_len));
155 }
156
157 regions
158}
159
160fn get_matched_ngrams(
161 tokens: &[TokenId],
162 start: usize,
163 end: usize,
164 automaton: &Automaton,
165) -> Vec<(usize, usize)> {
166 if start >= end || end > tokens.len() {
167 return Vec::new();
168 }
169
170 let region_tokens = &tokens[start..end];
171
172 let region_bytes: Vec<u8> = region_tokens
173 .iter()
174 .flat_map(|tid| tid.to_le_bytes())
175 .collect();
176
177 let offset = UNKNOWN_NGRAM_LENGTH;
178 let mut matches = Vec::new();
179
180 for m in automaton.find_overlapping_iter(®ion_bytes) {
181 let local_qend = m.end / 2;
182 let qend = start + local_qend;
183 let qstart = qend.saturating_sub(offset);
184 matches.push((qstart, qend));
185 }
186
187 matches
188}
189
190fn compute_qspan_union(positions: &[(usize, usize)]) -> PositionSet {
191 if positions.is_empty() {
192 return PositionSet::new();
193 }
194
195 let mut sorted: Vec<_> = positions.to_vec();
196 sorted.sort_by_key(|p| p.0);
197
198 let mut merged: Vec<(usize, usize)> = Vec::new();
199 let mut current = sorted[0];
200
201 for (start, end) in sorted.into_iter().skip(1) {
202 if start <= current.1 {
203 current.1 = current.1.max(end);
204 } else {
205 merged.push(current);
206 current = (start, end);
207 }
208 }
209 merged.push(current);
210
211 let mut result = PositionSet::new();
212 for (start, end) in merged {
213 result.extend_from_span(&PositionSpan::range(start, end));
214 }
215 result
216}
217
218fn compute_hispan_from_qspan(
219 tokens: &[TokenId],
220 qspan: &PositionSet,
221 index: &LicenseIndex,
222) -> usize {
223 qspan
224 .iter()
225 .filter(|&pos| {
226 tokens
227 .get(pos)
228 .is_some_and(|&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
229 })
230 .count()
231}
232
233fn create_unknown_match_from_qspan(query: &Query, qspan: &PositionSet) -> Option<LicenseMatch> {
234 if qspan.is_empty() {
235 return None;
236 }
237
238 let match_len = qspan.len();
239
240 let start = qspan.min_pos();
241 let end = qspan.max_pos() + 1;
242
243 let start_line = query
244 .line_by_pos
245 .get(start)
246 .copied()
247 .and_then(LineNumber::new)
248 .unwrap_or(LineNumber::ONE);
249 let end_line = query
250 .line_by_pos
251 .get(end.saturating_sub(1))
252 .copied()
253 .and_then(LineNumber::new)
254 .unwrap_or(start_line);
255
256 let qspan_positions: Vec<usize> = qspan.iter().collect();
257 let synthetic_rule_text =
258 build_unknown_rule_text(query, &qspan_positions, start_line, end_line);
259 let rule_identifier = build_unknown_rule_identifier(&synthetic_rule_text);
260
261 let ngram_count = qspan.len();
262
263 let score = calculate_score(ngram_count, match_len);
264
265 let qspan_span = qspan.to_position_span();
266
267 LicenseMatch {
268 rid: 0,
269 license_expression: "unknown".to_string(),
270 license_expression_spdx: None,
271 from_file: None,
272 start_line,
273 end_line,
274 start_token: start,
275 end_token: end,
276 matcher: MATCH_UNKNOWN,
277 score,
278 matched_length: match_len,
279 rule_length: match_len,
280 match_coverage: 100.0,
281 rule_relevance: 50,
282 rule_identifier,
283 rule_url: String::new(),
284 matched_text: None,
285 referenced_filenames: None,
286 rule_kind: crate::license_detection::models::RuleKind::None,
287 is_from_license: false,
288 rule_start_token: 0,
289 coordinates: MatchCoordinates::query_region(qspan_span),
290 candidate_resemblance: 0.0,
291 candidate_containment: 0.0,
292 }
293 .into()
294}
295
296fn build_unknown_rule_text(
297 query: &Query,
298 qspan_positions: &[usize],
299 start_line: LineNumber,
300 end_line: LineNumber,
301) -> String {
302 let Some(&start_pos) = qspan_positions.first() else {
303 return String::new();
304 };
305 let Some(&end_pos) = qspan_positions.last() else {
306 return String::new();
307 };
308
309 let matched_positions: PositionSet = qspan_positions.iter().copied().collect();
310 let tokens = tokenize_matched_unknown_text(&query.text, query);
311 let reportable_tokens = collect_reportable_unknown_tokens(
312 tokens,
313 &matched_positions,
314 start_pos,
315 end_pos,
316 start_line.get(),
317 end_line.get(),
318 );
319 let line_endings = collect_line_endings(&query.text);
320
321 render_unknown_rule_tokens(&reportable_tokens, &line_endings)
322}
323
324fn tokenize_matched_unknown_text(text: &str, query: &Query) -> Vec<MatchedTextToken> {
325 let mut tokens = Vec::new();
326 let mut pos = 0usize;
327 let mut line_num = 1usize;
328
329 for line in text.split_inclusive('\n') {
330 for capture in MATCHED_TEXT_PATTERN.captures_iter(line) {
331 if let Some(token_match) = capture.name("token") {
332 let token_text = token_match.as_str();
333 let retokenized: Vec<String> = QUERY_PATTERN
334 .find_iter(&token_text.to_lowercase())
335 .map(|m| m.as_str().to_string())
336 .filter(|token| !STOPWORDS.contains(token.as_str()))
337 .collect();
338
339 if retokenized.is_empty() {
340 tokens.push(MatchedTextToken {
341 value: token_text.to_string(),
342 line_num,
343 pos: None,
344 is_text: true,
345 is_known: false,
346 is_matched: false,
347 });
348 } else if retokenized.len() == 1 {
349 let token = &retokenized[0];
350 let is_known = query.index.dictionary.get(token).is_some();
351 let token_pos = if is_known {
352 let current_pos = pos;
353 pos += 1;
354 Some(current_pos)
355 } else {
356 None
357 };
358
359 tokens.push(MatchedTextToken {
360 value: token_text.to_string(),
361 line_num,
362 pos: token_pos,
363 is_text: true,
364 is_known,
365 is_matched: false,
366 });
367 } else {
368 for token in retokenized {
369 let is_known = query.index.dictionary.get(&token).is_some();
370 let token_pos = if is_known {
371 let current_pos = pos;
372 pos += 1;
373 Some(current_pos)
374 } else {
375 None
376 };
377
378 tokens.push(MatchedTextToken {
379 value: token,
380 line_num,
381 pos: token_pos,
382 is_text: true,
383 is_known,
384 is_matched: false,
385 });
386 }
387 }
388 } else if let Some(punct_match) = capture.name("punct") {
389 tokens.push(MatchedTextToken {
390 value: punct_match.as_str().to_string(),
391 line_num,
392 pos: None,
393 is_text: false,
394 is_known: false,
395 is_matched: false,
396 });
397 }
398 }
399
400 line_num += 1;
401 }
402
403 tokens
404}
405
/// Selects the tokens that make up the reported text of an unknown match.
///
/// Only tokens on lines `start_line..=end_line` are considered. A token is
/// kept when its position is in `matched_positions` (it is then also flagged
/// `is_matched`), or when it lies between the token at `start_pos` and the
/// token at `end_pos` inclusive. One extra token is kept after the end token
/// if it immediately follows it and is non-blank punctuation.
fn collect_reportable_unknown_tokens(
    tokens: Vec<MatchedTextToken>,
    matched_positions: &PositionSet,
    start_pos: usize,
    end_pos: usize,
    start_line: usize,
    end_line: usize,
) -> Vec<MatchedTextToken> {
    let mut reportable = Vec::new();
    // `started`: the token at `start_pos` has been seen and the end has not.
    let mut started = false;
    // `finished`: the token at `end_pos` has been seen.
    let mut finished = false;
    // Index (in `tokens`) of the end token, kept until the token after it has
    // been examined.
    let mut end_real_pos = None;
    // Index of the previously examined in-range token.
    let mut last_real_pos = None;

    for (real_pos, mut token) in tokens.into_iter().enumerate() {
        if token.line_num < start_line {
            continue;
        }

        // Line numbers never decrease, so nothing further can qualify.
        if token.line_num > end_line {
            break;
        }

        let mut is_included = false;

        if token
            .pos
            .is_some_and(|pos| token.is_known && matched_positions.contains(pos))
        {
            token.is_matched = true;
            is_included = true;
        }

        if !started && token.pos == Some(start_pos) {
            started = true;
            is_included = true;
        }

        // Everything inside the start..end window is kept, matched or not.
        if started && !finished {
            is_included = true;
        }

        // Checked after the window test so the end token itself is kept.
        if token.pos == Some(end_pos) {
            finished = true;
            started = false;
            end_real_pos = Some(real_pos);
        }

        // True only for the token directly after the end token: on the end
        // token's own iteration `last_real_pos` still points at its predecessor.
        if finished && !started && end_real_pos.is_some() && last_real_pos == end_real_pos {
            end_real_pos = None;
            if !token.is_text && !token.value.trim().is_empty() {
                is_included = true;
            }
        }

        last_real_pos = Some(real_pos);

        if is_included {
            reportable.push(token);
        }
    }

    reportable
}
470
/// Returns the terminator of every line in `text`, in order: `"\r\n"`,
/// `"\n"`, or an empty string for a final line without a newline.
fn collect_line_endings(text: &str) -> Vec<String> {
    let mut endings = Vec::new();

    for line in text.split_inclusive('\n') {
        let ending = match line.strip_suffix('\n') {
            Some(rest) if rest.ends_with('\r') => "\r\n",
            Some(_) => "\n",
            None => "",
        };
        endings.push(ending.to_owned());
    }

    endings
}
484
485fn render_unknown_rule_tokens(tokens: &[MatchedTextToken], line_endings: &[String]) -> String {
486 let mut rendered = String::new();
487 let mut previous_line: Option<usize> = None;
488
489 for token in tokens {
490 if let Some(prev_line) = previous_line
491 && token.line_num > prev_line
492 {
493 for line in prev_line..token.line_num {
494 if let Some(line_ending) = line_endings.get(line.saturating_sub(1)) {
495 rendered.push_str(line_ending.as_str());
496 }
497 }
498 }
499
500 let token_value = if token.is_text {
501 token.value.as_str()
502 } else {
503 token
504 .value
505 .strip_suffix("\r\n")
506 .or_else(|| token.value.strip_suffix('\n'))
507 .unwrap_or(token.value.as_str())
508 };
509
510 if token.is_text && !STOPWORDS.contains(token.value.to_lowercase().as_str()) {
511 if token.is_matched {
512 rendered.push_str(token_value);
513 } else {
514 rendered.push('.');
515 }
516 } else {
517 rendered.push_str(token_value);
518 }
519
520 previous_line = Some(token.line_num);
521 }
522
523 rendered
524}
525
526fn build_unknown_rule_identifier(rule_text: &str) -> String {
527 let content = format!("None{}", python_str_repr(rule_text));
528 let mut hasher = Sha1::new();
529 hasher.update(content.as_bytes());
530 let digest = hasher.finalize();
531
532 format!("license-detection-unknown-{}", hex::encode(digest))
533}
534
/// Formats `text` the way Python's `repr()` formats a `str`.
///
/// * The literal is wrapped in single quotes, or in double quotes when the
///   text contains `'` but no `"`.
/// * Backslash, newline, carriage return and tab use their short escapes; a
///   single quote is escaped only inside a single-quoted literal.
/// * Other control characters and non-ASCII whitespace (for example form feed,
///   NBSP, U+2028) are written as `\xNN`, `\uNNNN` or `\UNNNNNNNN` with
///   lowercase hex, as Python does for non-printable characters.
///
/// Limitation: Python also escapes further non-printable categories (format
/// characters, unassigned code points, ...). The standard library exposes no
/// classification for those, so they are still emitted verbatim.
fn python_str_repr(text: &str) -> String {
    let use_double_quotes = text.contains('\'') && !text.contains('"');
    let quote = if use_double_quotes { '"' } else { '\'' };
    let mut escaped = String::with_capacity(text.len());

    for ch in text.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            '\'' if !use_double_quotes => escaped.push_str("\\'"),
            // A `"` never needs escaping: double quotes are chosen as the
            // delimiter only when the text contains none.
            c if c.is_control() || (c.is_whitespace() && c != ' ') => {
                let code = u32::from(c);
                let escape = if code <= 0xff {
                    format!("\\x{code:02x}")
                } else if code <= 0xffff {
                    format!("\\u{code:04x}")
                } else {
                    format!("\\U{code:08x}")
                };
                escaped.push_str(&escape);
            }
            _ => escaped.push(ch),
        }
    }

    format!("{quote}{escaped}{quote}")
}
554
555fn calculate_score(ngram_count: usize, match_len: usize) -> MatchScore {
556 if match_len == 0 {
557 return MatchScore::default();
558 }
559
560 let density = ngram_count as f64 / match_len as f64;
561
562 MatchScore::from_percentage(density.min(1.0) * 100.0)
563}
564
// Unit tests for the unknown-license matcher and its private helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::{TokenId, tid};
    use crate::license_detection::query::Query;

    // Converts raw `u16` values into token ids.
    fn tids(values: &[u16]) -> Vec<TokenId> {
        values.iter().copied().map(tid).collect()
    }

    // An empty query produces no matches.
    #[test]
    fn test_unknown_match_empty_query() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("", &index, false).expect("Failed to create query");
        let known_matches = vec![];

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(matches.is_empty());
    }

    // With nothing covered, the whole query is one region.
    #[test]
    fn test_find_unmatched_regions_no_coverage() {
        let query_len = 10;
        let covered_positions = PositionSet::new();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions, vec![(0, 10)]);
    }

    // With everything covered, there are no regions.
    #[test]
    fn test_find_unmatched_regions_full_coverage() {
        let query_len = 10;
        let covered_positions: PositionSet = (0..10).collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert!(regions.is_empty());
    }

    // Coverage at both ends leaves the middle as the single region.
    #[test]
    fn test_find_unmatched_regions_partial_coverage() {
        let query_len = 20;
        let covered_positions: PositionSet = [0, 1, 2, 12, 13, 14, 15, 16, 17, 18, 19]
            .iter()
            .copied()
            .collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (3, 12));
    }

    // An uncovered tail is closed at `query_len`.
    #[test]
    fn test_find_unmatched_regions_trailing_unmatched() {
        let query_len = 20;
        let covered_positions: PositionSet = [0, 1, 2, 3, 4, 5].iter().copied().collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (6, 20));
    }

    // No intervals give an empty set.
    #[test]
    fn test_compute_qspan_union_empty() {
        let positions: Vec<(usize, usize)> = Vec::new();
        let merged = compute_qspan_union(&positions);
        assert!(merged.is_empty());
    }

    // A single interval is half-open: the end position is excluded.
    #[test]
    fn test_compute_qspan_union_single() {
        let positions = vec![(5, 11)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 6);
        assert!(merged.contains(5));
        assert!(merged.contains(10));
        assert!(!merged.contains(4));
        assert!(!merged.contains(11));
    }

    // Overlapping intervals merge; a disjoint one stays separate.
    #[test]
    fn test_compute_qspan_union_overlapping() {
        let positions = vec![(5, 11), (8, 14), (20, 26)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(!merged.contains(14));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
        assert!(!merged.contains(26));
    }

    // Intervals that merely touch are merged as well.
    #[test]
    fn test_compute_qspan_union_adjacent() {
        let positions = vec![(5, 11), (11, 17)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 12);
        assert!(merged.contains(5));
        assert!(merged.contains(16));
        assert!(!merged.contains(4));
        assert!(!merged.contains(17));
    }

    // Input order does not matter.
    #[test]
    fn test_compute_qspan_union_unsorted() {
        let positions = vec![(20, 26), (5, 11), (8, 14)];
        let merged = compute_qspan_union(&positions);
        assert_eq!(merged.len(), 15);
        assert!(merged.contains(5));
        assert!(merged.contains(13));
        assert!(merged.contains(20));
        assert!(merged.contains(25));
    }

    // Only positions holding legalese tokens are counted: the qspan covers ten
    // legalese positions (0..=9) and five regular ones (20..=24).
    #[test]
    fn test_compute_hispan_from_qspan() {
        let mut index = LicenseIndex::with_legalese_count(0);
        let legalese_entries: Vec<(String, u16)> =
            (0u16..15).map(|i| (format!("legalese-{i}"), i)).collect();
        index.dictionary =
            crate::license_detection::index::dictionary::TokenDictionary::new_with_legalese_pairs(
                &legalese_entries
                    .iter()
                    .map(|(token, id)| (token.as_str(), *id))
                    .collect::<Vec<_>>(),
            );

        let mut tokens: Vec<TokenId> = (0..15)
            .map(|i| {
                index
                    .dictionary
                    .get_token_id(&format!("legalese-{i}"))
                    .unwrap()
            })
            .collect();
        for i in 15..30 {
            tokens.push(index.dictionary.get_or_assign(&format!("regular-{i}")));
        }
        let qspan: PositionSet = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24]
            .iter()
            .copied()
            .collect();
        let hispan = compute_hispan_from_qspan(&tokens, &qspan, &index);
        assert_eq!(hispan, 10);
    }

    // An automaton without patterns never matches.
    #[test]
    fn test_get_matched_ngrams_empty_automaton() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3, 4, 5, 6, 7, 8]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 0, 8, &automaton);

        assert!(matches.is_empty());
    }

    // A six-token pattern (little-endian bytes of ids 0..=5) is found and
    // reported with the ngram length.
    #[test]
    fn test_get_matched_ngrams_with_matches() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens: Vec<TokenId> = (0..30).map(tid).collect();
        let ngram: Vec<u8> = vec![0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0];

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, 30, &automaton);

        assert!(!matches.is_empty(), "Should find ngram matches");

        for (qstart, qend) in &matches {
            assert_eq!(*qend - *qstart, UNKNOWN_NGRAM_LENGTH);
        }
    }

    // Overlapping occurrences of the same pattern are all reported.
    #[test]
    fn test_get_matched_ngrams_keeps_overlapping_matches() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3, 1, 2, 3, 1, 2, 3]);
        let overlapping_ngram: Vec<u8> = tokens[..UNKNOWN_NGRAM_LENGTH]
            .iter()
            .flat_map(|tid| tid.to_le_bytes())
            .collect();

        let mut builder = AutomatonBuilder::new();
        builder.add_pattern(&overlapping_ngram);
        let automaton = builder.build();

        let matches = get_matched_ngrams(&tokens, 0, tokens.len(), &automaton);

        assert_eq!(matches, vec![(0, 6), (3, 9)]);
    }

    // Higher density scores higher, the maximum is respected, and zero ngrams
    // give the default score.
    #[test]
    fn test_calculate_score() {
        let score1 = calculate_score(5, 10);
        let score2 = calculate_score(10, 10);
        let score3 = calculate_score(0, 10);

        assert!(score2 > score1);
        assert!(score2 <= MatchScore::MAX);
        assert_eq!(score3, MatchScore::default());
    }

    // An uncovered head starts at position zero.
    #[test]
    fn test_find_unmatched_regions_leading_unmatched() {
        let query_len = 20;
        let covered_positions: PositionSet = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
            .iter()
            .copied()
            .collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (0, 10));
    }

    // A gap between two covered runs is one region.
    #[test]
    fn test_find_unmatched_regions_middle_gap() {
        let query_len = 30;
        let covered_positions: PositionSet =
            [0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
                .iter()
                .copied()
                .collect();

        let regions = find_unmatched_regions(query_len, &covered_positions);

        assert_eq!(regions.len(), 1);
        assert_eq!(regions[0], (5, 20));
    }

    // Coverage follows the match's actual query positions, so gaps inside a
    // match stay uncovered.
    #[test]
    fn test_compute_covered_positions_gapped_qspan() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![LicenseMatch {
            rid: 0,
            license_expression: "test".to_string(),
            license_expression_spdx: Some("TEST".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 0,
            end_token: 10,
            matcher: MatcherKind::Aho,
            score: MatchScore::MAX,
            matched_length: 6,
            rule_length: 6,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some("matched text".to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::from_positions(vec![
                0, 1, 2, 7, 8, 9,
            ])),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(0), "Should contain position 0");
        assert!(covered.contains(2), "Should contain position 2");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(!covered.contains(3), "Should NOT contain position 3 (gap)");
        assert!(!covered.contains(5), "Should NOT contain position 5 (gap)");
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (outside)"
        );
    }

    // A contiguous match covers exactly its half-open range.
    #[test]
    fn test_compute_covered_positions_fallback_contiguous() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![LicenseMatch {
            rid: 0,
            license_expression: "test".to_string(),
            license_expression_spdx: Some("TEST".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 5,
            end_token: 10,
            matcher: MatcherKind::Aho,
            score: MatchScore::MAX,
            matched_length: 5,
            rule_length: 5,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some("matched text".to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::range(5, 10)),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }];

        let covered = compute_covered_positions(&query, &known_matches);

        assert!(covered.contains(5), "Should contain position 5");
        assert!(covered.contains(7), "Should contain position 7");
        assert!(covered.contains(9), "Should contain position 9");
        assert!(
            !covered.contains(4),
            "Should NOT contain position 4 (before)"
        );
        assert!(
            !covered.contains(10),
            "Should NOT contain position 10 (after)"
        );
    }

    // A gap inside a known match becomes its own unmatched region, unlike what
    // contiguous start..end coverage would give.
    #[test]
    fn test_compute_covered_positions_qspan_creates_extra_unmatched_region() {
        let index = LicenseIndex::with_legalese_count(10);
        let query = Query::from_extracted_text("some license text here", &index, false)
            .expect("Failed to create query");

        let known_matches = vec![LicenseMatch {
            rid: 0,
            license_expression: "test".to_string(),
            license_expression_spdx: Some("TEST".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 0,
            end_token: 15,
            matcher: MatcherKind::Aho,
            score: MatchScore::MAX,
            matched_length: 8,
            rule_length: 8,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some("matched text".to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::from_positions(vec![
                0, 1, 2, 3, 11, 12, 13, 14,
            ])),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }];

        let covered = compute_covered_positions(&query, &known_matches);
        let regions = find_unmatched_regions(20, &covered);

        assert!(
            regions.contains(&(4, 11)),
            "Should have unmatched region 4-11 (the gap in qspan_positions), got: {:?}",
            regions
        );
        assert!(
            regions.contains(&(15, 20)),
            "Should have trailing unmatched region 15-20, got: {:?}",
            regions
        );

        let contiguous_covered: PositionSet = (0..15).collect();
        let contiguous_regions = find_unmatched_regions(20, &contiguous_covered);
        assert_eq!(
            contiguous_regions,
            vec![(15, 20)],
            "Contiguous coverage would collapse the gap, producing only trailing region"
        );
    }

    // A non-empty qspan produces an "unknown" match tagged with the unknown
    // matcher kind.
    #[test]
    fn test_create_unknown_match_from_qspan_valid() {
        use crate::license_detection::test_utils::create_mock_query_with_tokens;

        let index = LicenseIndex::with_legalese_count(10);

        let tokens: Vec<u16> = (0..30).collect();
        let query = create_mock_query_with_tokens(&tokens, &index);

        let qspan: PositionSet = (0..30).collect();

        let match_result = create_unknown_match_from_qspan(&query, &qspan);

        assert!(
            match_result.is_some(),
            "Should create unknown match for sufficient length"
        );

        let m = match_result.unwrap();
        assert_eq!(m.license_expression, "unknown");
        assert_eq!(m.matcher, MATCH_UNKNOWN);
        assert!(!m.coordinates.query_span().is_empty());
    }

    // Either nothing is found, or the first unknown match starts after line one.
    #[test]
    fn test_unknown_match_with_known_matches() {
        let index = LicenseIndex::with_legalese_count(10);
        let text = "some text that is license related and should be detected";
        let query =
            Query::from_extracted_text(text, &index, false).expect("Failed to create query");

        let known_matches = vec![LicenseMatch {
            rid: 0,
            license_expression: "mit".to_string(),
            license_expression_spdx: Some("MIT".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 0,
            end_token: 5,
            matcher: MatcherKind::Aho,
            score: MatchScore::MAX,
            matched_length: 5,
            rule_length: 5,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "test-rule".to_string(),
            rule_url: String::new(),
            matched_text: Some("some text".to_string()),
            referenced_filenames: None,
            rule_kind: crate::license_detection::models::RuleKind::None,
            is_from_license: false,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::range(0, 5)),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }];

        let matches = unknown_match(&index, &query, &known_matches);

        assert!(
            matches.is_empty() || matches[0].start_line > LineNumber::ONE,
            "Should not re-detect known regions"
        );
    }

    // Zero length and zero ngrams give the default score; density above one is
    // capped at the maximum.
    #[test]
    fn test_calculate_score_edge_cases() {
        let score_zero_length = calculate_score(10, 0);
        assert_eq!(
            score_zero_length,
            MatchScore::default(),
            "Zero length should have zero score"
        );

        let score_zero_ngrams = calculate_score(0, 100);
        assert_eq!(
            score_zero_ngrams,
            MatchScore::default(),
            "Zero ngrams should have zero score"
        );

        let score_high_density = calculate_score(100, 50);
        assert_eq!(
            score_high_density,
            MatchScore::MAX,
            "High density should be capped at 100.0"
        );
    }

    // Out-of-bounds and inverted ranges return no matches instead of panicking.
    #[test]
    fn test_get_matched_ngrams_out_of_bounds() {
        use crate::license_detection::automaton::AutomatonBuilder;

        let tokens = tids(&[1, 2, 3]);
        let automaton = AutomatonBuilder::new().build();

        let matches = get_matched_ngrams(&tokens, 5, 10, &automaton);
        assert!(matches.is_empty(), "Out of bounds should return empty");

        let matches = get_matched_ngrams(&tokens, 2, 1, &automaton);
        assert!(matches.is_empty(), "Invalid range should return empty");
    }
}