1mod candidates;
23mod matching;
24
25#[cfg(test)]
26mod gfdl_debug_test;
27
28pub(crate) use candidates::{select_seq_candidates, select_seq_candidates_with_deadline};
29pub(crate) use matching::{seq_match_with_candidates, seq_match_with_candidates_and_deadline};
30
31use crate::license_detection::models::MatcherKind;
32
33pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
34
35pub const HIGH_RESEMBLANCE_THRESHOLD_TENTHS: u32 = 8;
36
37#[cfg(test)]
38pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = HIGH_RESEMBLANCE_THRESHOLD_TENTHS as f32 / 10.0;
39
40pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
42
43#[cfg(test)]
44mod tests {
45 use super::*;
46 use crate::license_detection::index::IndexedRuleMetadata;
47 use crate::license_detection::index::LicenseIndex;
48 use crate::license_detection::index::dictionary::{TokenId, TokenKind};
49 use crate::license_detection::models::Rule;
50 use crate::license_detection::query::Query;
51 use crate::license_detection::test_utils::create_test_index;
52 use crate::license_detection::{TokenMultiset, TokenSet};
53 use crate::models::LineNumber;
54 use std::collections::HashMap;
55
56 pub(super) fn create_seq_match_test_index() -> LicenseIndex {
57 create_test_index(
58 &[
59 ("license", 0),
60 ("copyright", 1),
61 ("permission", 2),
62 ("redistribute", 3),
63 ("granted", 4),
64 ],
65 5,
66 )
67 }
68
69 pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
70 let rid = index.rules_by_rid.len();
71 let tokens: Vec<TokenId> = text
72 .split_whitespace()
73 .filter_map(|word| index.dictionary.get(word))
74 .collect();
75
76 let set = TokenSet::from_token_ids(tokens.iter().copied());
77 let mset = TokenMultiset::from_token_ids(&tokens);
78 let _ = index.sets_by_rid.insert(rid, set.clone());
79 let _ = index.msets_by_rid.insert(rid, mset);
80
81 let high_set: TokenSet =
82 TokenSet::from_u16_iter(set.iter().filter(|&tid| {
83 index.dictionary.token_kind(TokenId::new(tid)) == TokenKind::Legalese
84 }));
85 if !high_set.is_empty() {
86 let _ = index.high_sets_by_rid.insert(rid, high_set);
87 }
88
89 let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
90 for (pos, &tid) in tokens.iter().enumerate() {
91 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
92 high_postings.entry(tid).or_default().push(pos);
93 }
94 }
95 let _ = index.high_postings_by_rid.insert(rid, high_postings);
96
97 let rule = Rule {
98 identifier: format!("{}.test", expression),
99 license_expression: expression.to_string(),
100 text: text.to_string(),
101 tokens: tokens.clone(),
102 rule_kind: crate::license_detection::models::RuleKind::Text,
103 is_false_positive: false,
104 is_required_phrase: false,
105 is_from_license: false,
106 relevance: 100,
107 minimum_coverage: None,
108 has_stored_minimum_coverage: false,
109 is_continuous: true,
110 referenced_filenames: None,
111 ignorable_urls: None,
112 ignorable_emails: None,
113 ignorable_copyrights: None,
114 ignorable_holders: None,
115 ignorable_authors: None,
116 language: None,
117 notes: None,
118 length_unique: tokens.len(),
119 high_length_unique: tokens
120 .iter()
121 .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
122 .count(),
123 high_length: tokens.len(),
124 min_matched_length: 1,
125 min_high_matched_length: 1,
126 min_matched_length_unique: 1,
127 min_high_matched_length_unique: 1,
128 is_small: false,
129 is_tiny: false,
130 starts_with_license: false,
131 ends_with_license: false,
132 is_deprecated: false,
133 spdx_license_key: None,
134 other_spdx_license_keys: vec![],
135 required_phrase_spans: vec![],
136 stopwords_by_pos: std::collections::HashMap::new(),
137 };
138
139 index.rules_by_rid.push(rule.clone());
140 index.tids_by_rid.push(tokens.clone());
141 index.approx_matchable_rids.insert(rid);
142
143 for &tid in &tokens {
145 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
146 index.rids_by_high_tid.entry(tid).or_default().insert(rid);
147 }
148 }
149
150 rid
151 }
152
153 #[test]
154 fn test_seq_match_basic() {
155 let mut index = create_seq_match_test_index();
156
157 add_test_rule(&mut index, "license copyright granted", "test-license");
158
159 let text = "license copyright granted here";
160 let query = Query::from_extracted_text(text, &index, false).unwrap();
161 let query_run = query.whole_query_run();
162
163 let candidates = select_seq_candidates(&index, &query_run, false, 50);
164 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
165
166 assert!(!matches.is_empty());
167 assert_eq!(matches[0].matcher, MATCH_SEQ);
168 }
169
170 #[test]
171 fn test_seq_match_uses_precomputed_spdx_expression() {
172 let mut index = create_seq_match_test_index();
173
174 add_test_rule(&mut index, "license copyright", "mit");
175 index.rule_metadata_by_identifier.insert(
176 "mit.test".to_string(),
177 IndexedRuleMetadata {
178 license_expression_spdx: Some("MIT".to_string()),
179 skip_for_required_phrase_generation: false,
180 replaced_by: vec![],
181 },
182 );
183
184 let text = "license copyright";
185 let query = Query::from_extracted_text(text, &index, false).unwrap();
186 let query_run = query.whole_query_run();
187 let candidates = select_seq_candidates(&index, &query_run, false, 50);
188 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
189
190 assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
191 }
192
193 #[test]
194 fn test_seq_match_partial_coverage_not_filtered() {
195 let mut index = create_seq_match_test_index();
196
197 add_test_rule(
198 &mut index,
199 "license copyright granted permission redistribute",
200 "test-long-license",
201 );
202
203 let text = "license copyright";
204 let query = Query::from_extracted_text(text, &index, false).unwrap();
205 let query_run = query.whole_query_run();
206
207 let candidates = select_seq_candidates(&index, &query_run, false, 50);
208 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
209
210 assert!(
211 !matches.is_empty(),
212 "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
213 );
214 assert!(matches[0].match_coverage < 50.0);
215 }
216
217 #[test]
218 fn test_seq_match_empty_query() {
219 let mut index = create_seq_match_test_index();
220
221 add_test_rule(&mut index, "license copyright", "test-license");
222
223 let text = "";
224 let query = Query::from_extracted_text(text, &index, false).unwrap();
225 let query_run = query.whole_query_run();
226
227 let candidates = select_seq_candidates(&index, &query_run, false, 50);
228 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
229
230 assert!(matches.is_empty());
231 }
232
233 #[test]
234 fn test_seq_match_constants() {
235 assert_eq!(MATCH_SEQ.as_str(), "3-seq");
236 assert_eq!(MATCH_SEQ.precedence(), 3);
237 }
238
239 #[test]
240 fn test_seq_match_with_no_legalese_intersection() {
241 let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
242
243 add_test_rule(&mut index, "word1 word2 word3", "test-license");
244
245 let text = "word1 word2 word3";
246 let query = Query::from_extracted_text(text, &index, false).unwrap();
247 let query_run = query.whole_query_run();
248
249 let candidates = select_seq_candidates(&index, &query_run, false, 50);
250 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
251
252 assert!(
253 matches.is_empty(),
254 "Should not match when tokens are not legalese (above len_legalese)"
255 );
256 }
257
258 #[test]
259 fn test_seq_match_multiple_occurrences() {
260 let mut index = create_seq_match_test_index();
261
262 add_test_rule(&mut index, "license copyright granted", "test-license");
263
264 let text = "license copyright granted some text license copyright granted more text";
265 let query = Query::from_extracted_text(text, &index, false).unwrap();
266 let query_run = query.whole_query_run();
267
268 let candidates = select_seq_candidates(&index, &query_run, false, 50);
269 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
270
271 assert!(
272 matches.len() >= 2,
273 "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
274 matches.len()
275 );
276
277 let license_expressions: Vec<&str> = matches
278 .iter()
279 .map(|m| m.license_expression.as_str())
280 .collect();
281 assert!(
282 license_expressions.iter().all(|&e| e == "test-license"),
283 "All matches should be for test-license"
284 );
285
286 let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line.get()).collect();
287 let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line.get()).collect();
288
289 assert!(
290 start_lines.iter().all(|&l| l >= 1),
291 "Start lines should be valid"
292 );
293 assert!(
294 end_lines.iter().all(|&l| l >= 1),
295 "End lines should be valid"
296 );
297 }
298
299 #[test]
300 fn test_seq_match_line_numbers_accurate() {
301 let mut index = create_seq_match_test_index();
302
303 add_test_rule(&mut index, "license copyright granted", "test-license");
304
305 let text = "line one\nlicense copyright granted\nline three";
306 let query = Query::from_extracted_text(text, &index, false).unwrap();
307 let query_run = query.whole_query_run();
308
309 let candidates = select_seq_candidates(&index, &query_run, false, 50);
310 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
311
312 assert!(!matches.is_empty(), "Should find matches");
313
314 let first_match = &matches[0];
315
316 assert_eq!(
317 first_match.start_line,
318 LineNumber::new(2).unwrap(),
319 "Match should start on line 2 (where license tokens are), not line 1"
320 );
321 assert_eq!(
322 first_match.end_line,
323 LineNumber::new(2).unwrap(),
324 "Match should end on line 2 (where license tokens are), not line 3"
325 );
326
327 assert!(
329 first_match.matched_text.is_none(),
330 "matched_text should be None during matching (computed lazily at output)"
331 );
332
333 let matched_text =
335 query.matched_text(first_match.start_line.get(), first_match.end_line.get());
336 assert!(
337 matched_text.contains("license"),
338 "Computed matched text should contain 'license'"
339 );
340 }
341
342 #[test]
343 fn test_seq_match_line_numbers_partial_match() {
344 let mut index = create_seq_match_test_index();
345
346 add_test_rule(
347 &mut index,
348 "license copyright granted permission",
349 "test-license",
350 );
351
352 let text = "line one\nlicense copyright\nline three";
353 let query = Query::from_extracted_text(text, &index, false).unwrap();
354 let query_run = query.whole_query_run();
355
356 let candidates = select_seq_candidates(&index, &query_run, false, 50);
357 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
358
359 assert!(!matches.is_empty(), "Should find partial matches");
360
361 let first_match = &matches[0];
362
363 assert_eq!(
364 first_match.start_line,
365 LineNumber::new(2).unwrap(),
366 "Partial match should start on line 2"
367 );
368 assert_eq!(
369 first_match.end_line,
370 LineNumber::new(2).unwrap(),
371 "Partial match should end on line 2"
372 );
373
374 assert!(
375 first_match.match_coverage < 100.0,
376 "Should be partial coverage"
377 );
378 }
379}