provenant/license_detection/seq_match/
mod.rs1mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub use candidates::{Candidate, compute_candidates_with_msets};
26pub use matching::seq_match_with_candidates;
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = 0.8;
34
35pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
37
38#[cfg(test)]
39mod tests {
40 use super::*;
41 use crate::license_detection::index::LicenseIndex;
42 use crate::license_detection::index::dictionary::{TokenId, TokenKind};
43 use crate::license_detection::index::token_sets::build_set_and_mset;
44 use crate::license_detection::models::Rule;
45 use crate::license_detection::query::Query;
46 use crate::license_detection::test_utils::create_test_index;
47 use std::collections::{HashMap, HashSet};
48
49 pub(super) fn create_seq_match_test_index() -> LicenseIndex {
50 create_test_index(
51 &[
52 ("license", 0),
53 ("copyright", 1),
54 ("permission", 2),
55 ("redistribute", 3),
56 ("granted", 4),
57 ],
58 5,
59 )
60 }
61
62 pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
63 let rid = index.rules_by_rid.len();
64 let tokens: Vec<TokenId> = text
65 .split_whitespace()
66 .filter_map(|word| index.dictionary.get(word))
67 .collect();
68
69 let (set, mset) = build_set_and_mset(&tokens);
70 let _ = index.sets_by_rid.insert(rid, set.clone());
71 let _ = index.msets_by_rid.insert(rid, mset);
72
73 let high_set: HashSet<TokenId> = set
74 .iter()
75 .filter(|&&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
76 .copied()
77 .collect();
78 if !high_set.is_empty() {
79 let _ = index.high_sets_by_rid.insert(rid, high_set);
80 }
81
82 let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
83 for (pos, &tid) in tokens.iter().enumerate() {
84 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
85 high_postings.entry(tid).or_default().push(pos);
86 }
87 }
88 let _ = index.high_postings_by_rid.insert(rid, high_postings);
89
90 let rule = Rule {
91 identifier: format!("{}.test", expression),
92 license_expression: expression.to_string(),
93 text: text.to_string(),
94 tokens: tokens.clone(),
95 rule_kind: crate::license_detection::models::RuleKind::Text,
96 is_false_positive: false,
97 is_required_phrase: false,
98 is_from_license: false,
99 relevance: 100,
100 minimum_coverage: None,
101 has_stored_minimum_coverage: false,
102 is_continuous: true,
103 referenced_filenames: None,
104 ignorable_urls: None,
105 ignorable_emails: None,
106 ignorable_copyrights: None,
107 ignorable_holders: None,
108 ignorable_authors: None,
109 language: None,
110 notes: None,
111 length_unique: tokens.len(),
112 high_length_unique: tokens
113 .iter()
114 .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
115 .count(),
116 high_length: tokens.len(),
117 min_matched_length: 1,
118 min_high_matched_length: 1,
119 min_matched_length_unique: 1,
120 min_high_matched_length_unique: 1,
121 is_small: false,
122 is_tiny: false,
123 starts_with_license: false,
124 ends_with_license: false,
125 is_deprecated: false,
126 spdx_license_key: None,
127 other_spdx_license_keys: vec![],
128 required_phrase_spans: vec![],
129 stopwords_by_pos: std::collections::HashMap::new(),
130 };
131
132 index.rules_by_rid.push(rule.clone());
133 index.tids_by_rid.push(tokens.clone());
134 index.approx_matchable_rids.insert(rid);
135
136 for &tid in &tokens {
138 if index.dictionary.token_kind(tid) == TokenKind::Legalese {
139 index.rids_by_high_tid.entry(tid).or_default().insert(rid);
140 }
141 }
142
143 rid
144 }
145
146 #[test]
147 fn test_seq_match_basic() {
148 let mut index = create_seq_match_test_index();
149
150 add_test_rule(&mut index, "license copyright granted", "test-license");
151
152 let text = "license copyright granted here";
153 let query = Query::from_extracted_text(text, &index, false).unwrap();
154 let query_run = query.whole_query_run();
155
156 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
157 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
158
159 assert!(!matches.is_empty());
160 assert_eq!(matches[0].matcher, MATCH_SEQ);
161 }
162
163 #[test]
164 fn test_seq_match_partial_coverage_not_filtered() {
165 let mut index = create_seq_match_test_index();
166
167 add_test_rule(
168 &mut index,
169 "license copyright granted permission redistribute",
170 "test-long-license",
171 );
172
173 let text = "license copyright";
174 let query = Query::from_extracted_text(text, &index, false).unwrap();
175 let query_run = query.whole_query_run();
176
177 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
178 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
179
180 assert!(
181 !matches.is_empty(),
182 "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
183 );
184 assert!(matches[0].match_coverage < 50.0);
185 }
186
187 #[test]
188 fn test_seq_match_empty_query() {
189 let mut index = create_seq_match_test_index();
190
191 add_test_rule(&mut index, "license copyright", "test-license");
192
193 let text = "";
194 let query = Query::from_extracted_text(text, &index, false).unwrap();
195 let query_run = query.whole_query_run();
196
197 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
198 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
199
200 assert!(matches.is_empty());
201 }
202
203 #[test]
204 fn test_seq_match_constants() {
205 assert_eq!(MATCH_SEQ.as_str(), "3-seq");
206 assert_eq!(MATCH_SEQ.precedence(), 3);
207 }
208
209 #[test]
210 fn test_seq_match_with_no_legalese_intersection() {
211 let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
212
213 add_test_rule(&mut index, "word1 word2 word3", "test-license");
214
215 let text = "word1 word2 word3";
216 let query = Query::from_extracted_text(text, &index, false).unwrap();
217 let query_run = query.whole_query_run();
218
219 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
220 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
221
222 assert!(
223 matches.is_empty(),
224 "Should not match when tokens are not legalese (above len_legalese)"
225 );
226 }
227
228 #[test]
229 fn test_seq_match_multiple_occurrences() {
230 let mut index = create_seq_match_test_index();
231
232 add_test_rule(&mut index, "license copyright granted", "test-license");
233
234 let text = "license copyright granted some text license copyright granted more text";
235 let query = Query::from_extracted_text(text, &index, false).unwrap();
236 let query_run = query.whole_query_run();
237
238 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
239 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
240
241 assert!(
242 matches.len() >= 2,
243 "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
244 matches.len()
245 );
246
247 let license_expressions: Vec<&str> = matches
248 .iter()
249 .map(|m| m.license_expression.as_str())
250 .collect();
251 assert!(
252 license_expressions.iter().all(|&e| e == "test-license"),
253 "All matches should be for test-license"
254 );
255
256 let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line).collect();
257 let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line).collect();
258
259 assert!(
260 start_lines.iter().all(|&l| l >= 1),
261 "Start lines should be valid"
262 );
263 assert!(
264 end_lines.iter().all(|&l| l >= 1),
265 "End lines should be valid"
266 );
267 }
268
269 #[test]
270 fn test_seq_match_line_numbers_accurate() {
271 let mut index = create_seq_match_test_index();
272
273 add_test_rule(&mut index, "license copyright granted", "test-license");
274
275 let text = "line one\nlicense copyright granted\nline three";
276 let query = Query::from_extracted_text(text, &index, false).unwrap();
277 let query_run = query.whole_query_run();
278
279 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
280 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
281
282 assert!(!matches.is_empty(), "Should find matches");
283
284 let first_match = &matches[0];
285
286 assert_eq!(
287 first_match.start_line, 2,
288 "Match should start on line 2 (where license tokens are), not line 1"
289 );
290 assert_eq!(
291 first_match.end_line, 2,
292 "Match should end on line 2 (where license tokens are), not line 3"
293 );
294
295 assert!(
297 first_match.matched_text.is_none(),
298 "matched_text should be None during matching (computed lazily at output)"
299 );
300
301 let matched_text = query.matched_text(first_match.start_line, first_match.end_line);
303 assert!(
304 matched_text.contains("license"),
305 "Computed matched text should contain 'license'"
306 );
307 }
308
309 #[test]
310 fn test_seq_match_line_numbers_partial_match() {
311 let mut index = create_seq_match_test_index();
312
313 add_test_rule(
314 &mut index,
315 "license copyright granted permission",
316 "test-license",
317 );
318
319 let text = "line one\nlicense copyright\nline three";
320 let query = Query::from_extracted_text(text, &index, false).unwrap();
321 let query_run = query.whole_query_run();
322
323 let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
324 let matches = seq_match_with_candidates(&index, &query_run, &candidates);
325
326 assert!(!matches.is_empty(), "Should find partial matches");
327
328 let first_match = &matches[0];
329
330 assert_eq!(
331 first_match.start_line, 2,
332 "Partial match should start on line 2"
333 );
334 assert_eq!(
335 first_match.end_line, 2,
336 "Partial match should end on line 2"
337 );
338
339 assert!(
340 first_match.match_coverage < 100.0,
341 "Should be partial coverage"
342 );
343 }
344}