1mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub use candidates::{Candidate, compute_candidates_with_msets};
26pub use matching::seq_match_with_candidates;
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = 0.8;
34
35pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
37
38#[cfg(test)]
39mod tests {
40 use super::*;
41 use crate::license_detection::index::IndexedRuleMetadata;
42 use crate::license_detection::index::LicenseIndex;
43 use crate::license_detection::index::dictionary::{TokenId, TokenKind};
44 use crate::license_detection::index::token_sets::build_set_and_mset;
45 use crate::license_detection::models::Rule;
46 use crate::license_detection::query::Query;
47 use crate::license_detection::test_utils::create_test_index;
48 use crate::license_detection::token_set::TokenSet;
49 use std::collections::HashMap;
50
51 pub(super) fn create_seq_match_test_index() -> LicenseIndex {
52 create_test_index(
53 &[
54 ("license", 0),
55 ("copyright", 1),
56 ("permission", 2),
57 ("redistribute", 3),
58 ("granted", 4),
59 ],
60 5,
61 )
62 }
63
64 pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
65 let rid = index.rules_by_rid.len();
66 let tokens: Vec<TokenId> = text
67 .split_whitespace()
68 .filter_map(|word| index.dictionary.get(word))
69 .collect();
70
71 let (set, mset) = build_set_and_mset(&tokens);
72 let token_set: TokenSet = TokenSet::from_u16_iter(set.iter().map(|tid| tid.raw()));
73 let _ = index.sets_by_rid.insert(rid, token_set);
74 let _ = index.msets_by_rid.insert(rid, mset);
75
76 let high_set: TokenSet = TokenSet::from_u16_iter(
77 set.iter()
78 .filter(|&&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
79 .map(|tid| tid.raw()),
80 );
81 if !high_set.is_empty() {
82 let _ = index.high_sets_by_rid.insert(rid, high_set);
83 }
84
85 let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
86 for (pos, &tid) in tokens.iter().enumerate() {
87 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
88 high_postings.entry(tid).or_default().push(pos);
89 }
90 }
91 let _ = index.high_postings_by_rid.insert(rid, high_postings);
92
93 let rule = Rule {
94 identifier: format!("{}.test", expression),
95 license_expression: expression.to_string(),
96 text: text.to_string(),
97 tokens: tokens.clone(),
98 rule_kind: crate::license_detection::models::RuleKind::Text,
99 is_false_positive: false,
100 is_required_phrase: false,
101 is_from_license: false,
102 relevance: 100,
103 minimum_coverage: None,
104 has_stored_minimum_coverage: false,
105 is_continuous: true,
106 referenced_filenames: None,
107 ignorable_urls: None,
108 ignorable_emails: None,
109 ignorable_copyrights: None,
110 ignorable_holders: None,
111 ignorable_authors: None,
112 language: None,
113 notes: None,
114 length_unique: tokens.len(),
115 high_length_unique: tokens
116 .iter()
117 .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
118 .count(),
119 high_length: tokens.len(),
120 min_matched_length: 1,
121 min_high_matched_length: 1,
122 min_matched_length_unique: 1,
123 min_high_matched_length_unique: 1,
124 is_small: false,
125 is_tiny: false,
126 starts_with_license: false,
127 ends_with_license: false,
128 is_deprecated: false,
129 spdx_license_key: None,
130 other_spdx_license_keys: vec![],
131 required_phrase_spans: vec![],
132 stopwords_by_pos: std::collections::HashMap::new(),
133 };
134
135 index.rules_by_rid.push(rule.clone());
136 index.tids_by_rid.push(tokens.clone());
137 index.approx_matchable_rids.insert(rid);
138
139 for &tid in &tokens {
141 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
142 index.rids_by_high_tid.entry(tid).or_default().insert(rid);
143 }
144 }
145
146 rid
147 }
148
149 #[test]
150 fn test_seq_match_basic() {
151 let mut index = create_seq_match_test_index();
152
153 add_test_rule(&mut index, "license copyright granted", "test-license");
154
155 let text = "license copyright granted here";
156 let query = Query::from_extracted_text(text, &index, false).unwrap();
157 let query_run = query.whole_query_run();
158
159 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
160 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
161
162 assert!(!matches.is_empty());
163 assert_eq!(matches[0].matcher, MATCH_SEQ);
164 }
165
166 #[test]
167 fn test_seq_match_uses_precomputed_spdx_expression() {
168 let mut index = create_seq_match_test_index();
169
170 add_test_rule(&mut index, "license copyright", "mit");
171 index.rule_metadata_by_identifier.insert(
172 "mit.test".to_string(),
173 IndexedRuleMetadata {
174 license_expression_spdx: Some("MIT".to_string()),
175 skip_for_required_phrase_generation: false,
176 replaced_by: vec![],
177 },
178 );
179
180 let text = "license copyright";
181 let query = Query::from_extracted_text(text, &index, false).unwrap();
182 let query_run = query.whole_query_run();
183 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
184 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
185
186 assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
187 }
188
189 #[test]
190 fn test_seq_match_partial_coverage_not_filtered() {
191 let mut index = create_seq_match_test_index();
192
193 add_test_rule(
194 &mut index,
195 "license copyright granted permission redistribute",
196 "test-long-license",
197 );
198
199 let text = "license copyright";
200 let query = Query::from_extracted_text(text, &index, false).unwrap();
201 let query_run = query.whole_query_run();
202
203 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
204 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
205
206 assert!(
207 !matches.is_empty(),
208 "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
209 );
210 assert!(matches[0].match_coverage < 50.0);
211 }
212
213 #[test]
214 fn test_seq_match_empty_query() {
215 let mut index = create_seq_match_test_index();
216
217 add_test_rule(&mut index, "license copyright", "test-license");
218
219 let text = "";
220 let query = Query::from_extracted_text(text, &index, false).unwrap();
221 let query_run = query.whole_query_run();
222
223 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
224 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
225
226 assert!(matches.is_empty());
227 }
228
229 #[test]
230 fn test_seq_match_constants() {
231 assert_eq!(MATCH_SEQ.as_str(), "3-seq");
232 assert_eq!(MATCH_SEQ.precedence(), 3);
233 }
234
235 #[test]
236 fn test_seq_match_with_no_legalese_intersection() {
237 let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
238
239 add_test_rule(&mut index, "word1 word2 word3", "test-license");
240
241 let text = "word1 word2 word3";
242 let query = Query::from_extracted_text(text, &index, false).unwrap();
243 let query_run = query.whole_query_run();
244
245 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
246 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
247
248 assert!(
249 matches.is_empty(),
250 "Should not match when tokens are not legalese (above len_legalese)"
251 );
252 }
253
254 #[test]
255 fn test_seq_match_multiple_occurrences() {
256 let mut index = create_seq_match_test_index();
257
258 add_test_rule(&mut index, "license copyright granted", "test-license");
259
260 let text = "license copyright granted some text license copyright granted more text";
261 let query = Query::from_extracted_text(text, &index, false).unwrap();
262 let query_run = query.whole_query_run();
263
264 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
265 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
266
267 assert!(
268 matches.len() >= 2,
269 "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
270 matches.len()
271 );
272
273 let license_expressions: Vec<&str> = matches
274 .iter()
275 .map(|m| m.license_expression.as_str())
276 .collect();
277 assert!(
278 license_expressions.iter().all(|&e| e == "test-license"),
279 "All matches should be for test-license"
280 );
281
282 let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line).collect();
283 let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line).collect();
284
285 assert!(
286 start_lines.iter().all(|&l| l >= 1),
287 "Start lines should be valid"
288 );
289 assert!(
290 end_lines.iter().all(|&l| l >= 1),
291 "End lines should be valid"
292 );
293 }
294
295 #[test]
296 fn test_seq_match_line_numbers_accurate() {
297 let mut index = create_seq_match_test_index();
298
299 add_test_rule(&mut index, "license copyright granted", "test-license");
300
301 let text = "line one\nlicense copyright granted\nline three";
302 let query = Query::from_extracted_text(text, &index, false).unwrap();
303 let query_run = query.whole_query_run();
304
305 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
306 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
307
308 assert!(!matches.is_empty(), "Should find matches");
309
310 let first_match = &matches[0];
311
312 assert_eq!(
313 first_match.start_line, 2,
314 "Match should start on line 2 (where license tokens are), not line 1"
315 );
316 assert_eq!(
317 first_match.end_line, 2,
318 "Match should end on line 2 (where license tokens are), not line 3"
319 );
320
321 assert!(
323 first_match.matched_text.is_none(),
324 "matched_text should be None during matching (computed lazily at output)"
325 );
326
327 let matched_text = query.matched_text(first_match.start_line, first_match.end_line);
329 assert!(
330 matched_text.contains("license"),
331 "Computed matched text should contain 'license'"
332 );
333 }
334
335 #[test]
336 fn test_seq_match_line_numbers_partial_match() {
337 let mut index = create_seq_match_test_index();
338
339 add_test_rule(
340 &mut index,
341 "license copyright granted permission",
342 "test-license",
343 );
344
345 let text = "line one\nlicense copyright\nline three";
346 let query = Query::from_extracted_text(text, &index, false).unwrap();
347 let query_run = query.whole_query_run();
348
349 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
350 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
351
352 assert!(!matches.is_empty(), "Should find partial matches");
353
354 let first_match = &matches[0];
355
356 assert_eq!(
357 first_match.start_line, 2,
358 "Partial match should start on line 2"
359 );
360 assert_eq!(
361 first_match.end_line, 2,
362 "Partial match should end on line 2"
363 );
364
365 assert!(
366 first_match.match_coverage < 100.0,
367 "Should be partial coverage"
368 );
369 }
370}