1mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub(crate) use candidates::{select_seq_candidates, select_seq_candidates_with_deadline};
26pub(crate) use matching::{seq_match_with_candidates, seq_match_with_candidates_and_deadline};
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32pub const HIGH_RESEMBLANCE_THRESHOLD_TENTHS: u32 = 8;
33
34#[cfg(test)]
35pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = HIGH_RESEMBLANCE_THRESHOLD_TENTHS as f32 / 10.0;
36
37pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
39
40#[cfg(test)]
41mod tests {
42 use super::*;
43 use crate::license_detection::index::IndexedRuleMetadata;
44 use crate::license_detection::index::LicenseIndex;
45 use crate::license_detection::index::dictionary::{TokenId, TokenKind};
46 use crate::license_detection::models::Rule;
47 use crate::license_detection::query::Query;
48 use crate::license_detection::test_utils::create_test_index;
49 use crate::license_detection::{TokenMultiset, TokenSet};
50 use crate::models::LineNumber;
51 use std::collections::HashMap;
52
53 pub(super) fn create_seq_match_test_index() -> LicenseIndex {
54 create_test_index(
55 &[
56 ("license", 0),
57 ("copyright", 1),
58 ("permission", 2),
59 ("redistribute", 3),
60 ("granted", 4),
61 ],
62 5,
63 )
64 }
65
66 pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
67 let rid = index.rules_by_rid.len();
68 let tokens: Vec<TokenId> = text
69 .split_whitespace()
70 .filter_map(|word| index.dictionary.get(word))
71 .collect();
72
73 let set = TokenSet::from_token_ids(tokens.iter().copied());
74 let mset = TokenMultiset::from_token_ids(&tokens);
75 let _ = index.sets_by_rid.insert(rid, set.clone());
76 let _ = index.msets_by_rid.insert(rid, mset);
77
78 let high_set: TokenSet =
79 TokenSet::from_u16_iter(set.iter().filter(|&tid| {
80 index.dictionary.token_kind(TokenId::new(tid)) == TokenKind::Legalese
81 }));
82 if !high_set.is_empty() {
83 let _ = index.high_sets_by_rid.insert(rid, high_set);
84 }
85
86 let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
87 for (pos, &tid) in tokens.iter().enumerate() {
88 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
89 high_postings.entry(tid).or_default().push(pos);
90 }
91 }
92 let _ = index.high_postings_by_rid.insert(rid, high_postings);
93
94 let rule = Rule {
95 identifier: format!("{}.test", expression),
96 license_expression: expression.to_string(),
97 text: text.to_string(),
98 tokens: tokens.clone(),
99 rule_kind: crate::license_detection::models::RuleKind::Text,
100 is_false_positive: false,
101 is_required_phrase: false,
102 is_from_license: false,
103 relevance: 100,
104 minimum_coverage: None,
105 has_stored_minimum_coverage: false,
106 is_continuous: true,
107 referenced_filenames: None,
108 ignorable_urls: None,
109 ignorable_emails: None,
110 ignorable_copyrights: None,
111 ignorable_holders: None,
112 ignorable_authors: None,
113 language: None,
114 notes: None,
115 length_unique: tokens.len(),
116 high_length_unique: tokens
117 .iter()
118 .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
119 .count(),
120 high_length: tokens.len(),
121 min_matched_length: 1,
122 min_high_matched_length: 1,
123 min_matched_length_unique: 1,
124 min_high_matched_length_unique: 1,
125 is_small: false,
126 is_tiny: false,
127 starts_with_license: false,
128 ends_with_license: false,
129 is_deprecated: false,
130 spdx_license_key: None,
131 other_spdx_license_keys: vec![],
132 required_phrase_spans: vec![],
133 stopwords_by_pos: std::collections::HashMap::new(),
134 };
135
136 index.rules_by_rid.push(rule.clone());
137 index.tids_by_rid.push(tokens.clone());
138 index.approx_matchable_rids.insert(rid);
139
140 for &tid in &tokens {
142 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
143 index.rids_by_high_tid.entry(tid).or_default().insert(rid);
144 }
145 }
146
147 rid
148 }
149
150 #[test]
151 fn test_seq_match_basic() {
152 let mut index = create_seq_match_test_index();
153
154 add_test_rule(&mut index, "license copyright granted", "test-license");
155
156 let text = "license copyright granted here";
157 let query = Query::from_extracted_text(text, &index, false).unwrap();
158 let query_run = query.whole_query_run();
159
160 let candidates = select_seq_candidates(&index, &query_run, false, 50);
161 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
162
163 assert!(!matches.is_empty());
164 assert_eq!(matches[0].matcher, MATCH_SEQ);
165 }
166
167 #[test]
168 fn test_seq_match_uses_precomputed_spdx_expression() {
169 let mut index = create_seq_match_test_index();
170
171 add_test_rule(&mut index, "license copyright", "mit");
172 index.rule_metadata_by_identifier.insert(
173 "mit.test".to_string(),
174 IndexedRuleMetadata {
175 license_expression_spdx: Some("MIT".to_string()),
176 skip_for_required_phrase_generation: false,
177 replaced_by: vec![],
178 },
179 );
180
181 let text = "license copyright";
182 let query = Query::from_extracted_text(text, &index, false).unwrap();
183 let query_run = query.whole_query_run();
184 let candidates = select_seq_candidates(&index, &query_run, false, 50);
185 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
186
187 assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
188 }
189
190 #[test]
191 fn test_seq_match_partial_coverage_not_filtered() {
192 let mut index = create_seq_match_test_index();
193
194 add_test_rule(
195 &mut index,
196 "license copyright granted permission redistribute",
197 "test-long-license",
198 );
199
200 let text = "license copyright";
201 let query = Query::from_extracted_text(text, &index, false).unwrap();
202 let query_run = query.whole_query_run();
203
204 let candidates = select_seq_candidates(&index, &query_run, false, 50);
205 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
206
207 assert!(
208 !matches.is_empty(),
209 "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
210 );
211 assert!(matches[0].match_coverage < 50.0);
212 }
213
214 #[test]
215 fn test_seq_match_empty_query() {
216 let mut index = create_seq_match_test_index();
217
218 add_test_rule(&mut index, "license copyright", "test-license");
219
220 let text = "";
221 let query = Query::from_extracted_text(text, &index, false).unwrap();
222 let query_run = query.whole_query_run();
223
224 let candidates = select_seq_candidates(&index, &query_run, false, 50);
225 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
226
227 assert!(matches.is_empty());
228 }
229
230 #[test]
231 fn test_seq_match_constants() {
232 assert_eq!(MATCH_SEQ.as_str(), "3-seq");
233 assert_eq!(MATCH_SEQ.precedence(), 3);
234 }
235
236 #[test]
237 fn test_seq_match_with_no_legalese_intersection() {
238 let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
239
240 add_test_rule(&mut index, "word1 word2 word3", "test-license");
241
242 let text = "word1 word2 word3";
243 let query = Query::from_extracted_text(text, &index, false).unwrap();
244 let query_run = query.whole_query_run();
245
246 let candidates = select_seq_candidates(&index, &query_run, false, 50);
247 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
248
249 assert!(
250 matches.is_empty(),
251 "Should not match when tokens are not legalese (above len_legalese)"
252 );
253 }
254
255 #[test]
256 fn test_seq_match_multiple_occurrences() {
257 let mut index = create_seq_match_test_index();
258
259 add_test_rule(&mut index, "license copyright granted", "test-license");
260
261 let text = "license copyright granted some text license copyright granted more text";
262 let query = Query::from_extracted_text(text, &index, false).unwrap();
263 let query_run = query.whole_query_run();
264
265 let candidates = select_seq_candidates(&index, &query_run, false, 50);
266 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
267
268 assert!(
269 matches.len() >= 2,
270 "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
271 matches.len()
272 );
273
274 let license_expressions: Vec<&str> = matches
275 .iter()
276 .map(|m| m.license_expression.as_str())
277 .collect();
278 assert!(
279 license_expressions.iter().all(|&e| e == "test-license"),
280 "All matches should be for test-license"
281 );
282
283 let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line.get()).collect();
284 let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line.get()).collect();
285
286 assert!(
287 start_lines.iter().all(|&l| l >= 1),
288 "Start lines should be valid"
289 );
290 assert!(
291 end_lines.iter().all(|&l| l >= 1),
292 "End lines should be valid"
293 );
294 }
295
296 #[test]
297 fn test_seq_match_line_numbers_accurate() {
298 let mut index = create_seq_match_test_index();
299
300 add_test_rule(&mut index, "license copyright granted", "test-license");
301
302 let text = "line one\nlicense copyright granted\nline three";
303 let query = Query::from_extracted_text(text, &index, false).unwrap();
304 let query_run = query.whole_query_run();
305
306 let candidates = select_seq_candidates(&index, &query_run, false, 50);
307 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
308
309 assert!(!matches.is_empty(), "Should find matches");
310
311 let first_match = &matches[0];
312
313 assert_eq!(
314 first_match.start_line,
315 LineNumber::new(2).unwrap(),
316 "Match should start on line 2 (where license tokens are), not line 1"
317 );
318 assert_eq!(
319 first_match.end_line,
320 LineNumber::new(2).unwrap(),
321 "Match should end on line 2 (where license tokens are), not line 3"
322 );
323
324 assert!(
326 first_match.matched_text.is_none(),
327 "matched_text should be None during matching (computed lazily at output)"
328 );
329
330 let matched_text =
332 query.matched_text(first_match.start_line.get(), first_match.end_line.get());
333 assert!(
334 matched_text.contains("license"),
335 "Computed matched text should contain 'license'"
336 );
337 }
338
339 #[test]
340 fn test_seq_match_line_numbers_partial_match() {
341 let mut index = create_seq_match_test_index();
342
343 add_test_rule(
344 &mut index,
345 "license copyright granted permission",
346 "test-license",
347 );
348
349 let text = "line one\nlicense copyright\nline three";
350 let query = Query::from_extracted_text(text, &index, false).unwrap();
351 let query_run = query.whole_query_run();
352
353 let candidates = select_seq_candidates(&index, &query_run, false, 50);
354 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
355
356 assert!(!matches.is_empty(), "Should find partial matches");
357
358 let first_match = &matches[0];
359
360 assert_eq!(
361 first_match.start_line,
362 LineNumber::new(2).unwrap(),
363 "Partial match should start on line 2"
364 );
365 assert_eq!(
366 first_match.end_line,
367 LineNumber::new(2).unwrap(),
368 "Partial match should end on line 2"
369 );
370
371 assert!(
372 first_match.match_coverage < 100.0,
373 "Should be partial coverage"
374 );
375 }
376}