1mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub(crate) use candidates::select_seq_candidates;
26pub(crate) use matching::seq_match_with_candidates;
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = 0.8;
34
35pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
37
38#[cfg(test)]
39mod tests {
40 use super::*;
41 use crate::license_detection::index::IndexedRuleMetadata;
42 use crate::license_detection::index::LicenseIndex;
43 use crate::license_detection::index::dictionary::{TokenId, TokenKind};
44 use crate::license_detection::models::Rule;
45 use crate::license_detection::query::Query;
46 use crate::license_detection::test_utils::create_test_index;
47 use crate::license_detection::{TokenMultiset, TokenSet};
48 use std::collections::HashMap;
49
50 pub(super) fn create_seq_match_test_index() -> LicenseIndex {
51 create_test_index(
52 &[
53 ("license", 0),
54 ("copyright", 1),
55 ("permission", 2),
56 ("redistribute", 3),
57 ("granted", 4),
58 ],
59 5,
60 )
61 }
62
63 pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
64 let rid = index.rules_by_rid.len();
65 let tokens: Vec<TokenId> = text
66 .split_whitespace()
67 .filter_map(|word| index.dictionary.get(word))
68 .collect();
69
70 let set = TokenSet::from_token_ids(tokens.iter().copied());
71 let mset = TokenMultiset::from_token_ids(&tokens);
72 let _ = index.sets_by_rid.insert(rid, set.clone());
73 let _ = index.msets_by_rid.insert(rid, mset);
74
75 let high_set: TokenSet =
76 TokenSet::from_u16_iter(set.iter().filter(|&tid| {
77 index.dictionary.token_kind(TokenId::new(tid)) == TokenKind::Legalese
78 }));
79 if !high_set.is_empty() {
80 let _ = index.high_sets_by_rid.insert(rid, high_set);
81 }
82
83 let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
84 for (pos, &tid) in tokens.iter().enumerate() {
85 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
86 high_postings.entry(tid).or_default().push(pos);
87 }
88 }
89 let _ = index.high_postings_by_rid.insert(rid, high_postings);
90
91 let rule = Rule {
92 identifier: format!("{}.test", expression),
93 license_expression: expression.to_string(),
94 text: text.to_string(),
95 tokens: tokens.clone(),
96 rule_kind: crate::license_detection::models::RuleKind::Text,
97 is_false_positive: false,
98 is_required_phrase: false,
99 is_from_license: false,
100 relevance: 100,
101 minimum_coverage: None,
102 has_stored_minimum_coverage: false,
103 is_continuous: true,
104 referenced_filenames: None,
105 ignorable_urls: None,
106 ignorable_emails: None,
107 ignorable_copyrights: None,
108 ignorable_holders: None,
109 ignorable_authors: None,
110 language: None,
111 notes: None,
112 length_unique: tokens.len(),
113 high_length_unique: tokens
114 .iter()
115 .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
116 .count(),
117 high_length: tokens.len(),
118 min_matched_length: 1,
119 min_high_matched_length: 1,
120 min_matched_length_unique: 1,
121 min_high_matched_length_unique: 1,
122 is_small: false,
123 is_tiny: false,
124 starts_with_license: false,
125 ends_with_license: false,
126 is_deprecated: false,
127 spdx_license_key: None,
128 other_spdx_license_keys: vec![],
129 required_phrase_spans: vec![],
130 stopwords_by_pos: std::collections::HashMap::new(),
131 };
132
133 index.rules_by_rid.push(rule.clone());
134 index.tids_by_rid.push(tokens.clone());
135 index.approx_matchable_rids.insert(rid);
136
137 for &tid in &tokens {
139 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
140 index.rids_by_high_tid.entry(tid).or_default().insert(rid);
141 }
142 }
143
144 rid
145 }
146
147 #[test]
148 fn test_seq_match_basic() {
149 let mut index = create_seq_match_test_index();
150
151 add_test_rule(&mut index, "license copyright granted", "test-license");
152
153 let text = "license copyright granted here";
154 let query = Query::from_extracted_text(text, &index, false).unwrap();
155 let query_run = query.whole_query_run();
156
157 let candidates = select_seq_candidates(&index, &query_run, false, 50);
158 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
159
160 assert!(!matches.is_empty());
161 assert_eq!(matches[0].matcher, MATCH_SEQ);
162 }
163
164 #[test]
165 fn test_seq_match_uses_precomputed_spdx_expression() {
166 let mut index = create_seq_match_test_index();
167
168 add_test_rule(&mut index, "license copyright", "mit");
169 index.rule_metadata_by_identifier.insert(
170 "mit.test".to_string(),
171 IndexedRuleMetadata {
172 license_expression_spdx: Some("MIT".to_string()),
173 skip_for_required_phrase_generation: false,
174 replaced_by: vec![],
175 },
176 );
177
178 let text = "license copyright";
179 let query = Query::from_extracted_text(text, &index, false).unwrap();
180 let query_run = query.whole_query_run();
181 let candidates = select_seq_candidates(&index, &query_run, false, 50);
182 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
183
184 assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
185 }
186
187 #[test]
188 fn test_seq_match_partial_coverage_not_filtered() {
189 let mut index = create_seq_match_test_index();
190
191 add_test_rule(
192 &mut index,
193 "license copyright granted permission redistribute",
194 "test-long-license",
195 );
196
197 let text = "license copyright";
198 let query = Query::from_extracted_text(text, &index, false).unwrap();
199 let query_run = query.whole_query_run();
200
201 let candidates = select_seq_candidates(&index, &query_run, false, 50);
202 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
203
204 assert!(
205 !matches.is_empty(),
206 "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
207 );
208 assert!(matches[0].match_coverage < 50.0);
209 }
210
211 #[test]
212 fn test_seq_match_empty_query() {
213 let mut index = create_seq_match_test_index();
214
215 add_test_rule(&mut index, "license copyright", "test-license");
216
217 let text = "";
218 let query = Query::from_extracted_text(text, &index, false).unwrap();
219 let query_run = query.whole_query_run();
220
221 let candidates = select_seq_candidates(&index, &query_run, false, 50);
222 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
223
224 assert!(matches.is_empty());
225 }
226
227 #[test]
228 fn test_seq_match_constants() {
229 assert_eq!(MATCH_SEQ.as_str(), "3-seq");
230 assert_eq!(MATCH_SEQ.precedence(), 3);
231 }
232
233 #[test]
234 fn test_seq_match_with_no_legalese_intersection() {
235 let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
236
237 add_test_rule(&mut index, "word1 word2 word3", "test-license");
238
239 let text = "word1 word2 word3";
240 let query = Query::from_extracted_text(text, &index, false).unwrap();
241 let query_run = query.whole_query_run();
242
243 let candidates = select_seq_candidates(&index, &query_run, false, 50);
244 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
245
246 assert!(
247 matches.is_empty(),
248 "Should not match when tokens are not legalese (above len_legalese)"
249 );
250 }
251
252 #[test]
253 fn test_seq_match_multiple_occurrences() {
254 let mut index = create_seq_match_test_index();
255
256 add_test_rule(&mut index, "license copyright granted", "test-license");
257
258 let text = "license copyright granted some text license copyright granted more text";
259 let query = Query::from_extracted_text(text, &index, false).unwrap();
260 let query_run = query.whole_query_run();
261
262 let candidates = select_seq_candidates(&index, &query_run, false, 50);
263 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
264
265 assert!(
266 matches.len() >= 2,
267 "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
268 matches.len()
269 );
270
271 let license_expressions: Vec<&str> = matches
272 .iter()
273 .map(|m| m.license_expression.as_str())
274 .collect();
275 assert!(
276 license_expressions.iter().all(|&e| e == "test-license"),
277 "All matches should be for test-license"
278 );
279
280 let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line).collect();
281 let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line).collect();
282
283 assert!(
284 start_lines.iter().all(|&l| l >= 1),
285 "Start lines should be valid"
286 );
287 assert!(
288 end_lines.iter().all(|&l| l >= 1),
289 "End lines should be valid"
290 );
291 }
292
293 #[test]
294 fn test_seq_match_line_numbers_accurate() {
295 let mut index = create_seq_match_test_index();
296
297 add_test_rule(&mut index, "license copyright granted", "test-license");
298
299 let text = "line one\nlicense copyright granted\nline three";
300 let query = Query::from_extracted_text(text, &index, false).unwrap();
301 let query_run = query.whole_query_run();
302
303 let candidates = select_seq_candidates(&index, &query_run, false, 50);
304 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
305
306 assert!(!matches.is_empty(), "Should find matches");
307
308 let first_match = &matches[0];
309
310 assert_eq!(
311 first_match.start_line, 2,
312 "Match should start on line 2 (where license tokens are), not line 1"
313 );
314 assert_eq!(
315 first_match.end_line, 2,
316 "Match should end on line 2 (where license tokens are), not line 3"
317 );
318
319 assert!(
321 first_match.matched_text.is_none(),
322 "matched_text should be None during matching (computed lazily at output)"
323 );
324
325 let matched_text = query.matched_text(first_match.start_line, first_match.end_line);
327 assert!(
328 matched_text.contains("license"),
329 "Computed matched text should contain 'license'"
330 );
331 }
332
333 #[test]
334 fn test_seq_match_line_numbers_partial_match() {
335 let mut index = create_seq_match_test_index();
336
337 add_test_rule(
338 &mut index,
339 "license copyright granted permission",
340 "test-license",
341 );
342
343 let text = "line one\nlicense copyright\nline three";
344 let query = Query::from_extracted_text(text, &index, false).unwrap();
345 let query_run = query.whole_query_run();
346
347 let candidates = select_seq_candidates(&index, &query_run, false, 50);
348 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
349
350 assert!(!matches.is_empty(), "Should find partial matches");
351
352 let first_match = &matches[0];
353
354 assert_eq!(
355 first_match.start_line, 2,
356 "Partial match should start on line 2"
357 );
358 assert_eq!(
359 first_match.end_line, 2,
360 "Partial match should end on line 2"
361 );
362
363 assert!(
364 first_match.match_coverage < 100.0,
365 "Should be partial coverage"
366 );
367 }
368}