Skip to main content

provenant/license_detection/seq_match/
mod.rs

1//! Approximate sequence matching for license detection.
2//!
3//! This module implements sequence-based matching using set similarity for
4//! candidate selection, followed by sequence alignment to find matching blocks.
5//!
6//! Based on Python ScanCode Toolkit implementation at:
7//! reference/scancode-toolkit/src/licensedcode/match_seq.py
8//!
9//! ## Near-Duplicate Detection
10//!
11//! This module implements Phase 2 of Python's 3-phase matching pipeline:
12//! 1. Phase 1: Hash & Aho-Corasick (exact matches)
13//! 2. Phase 2: Near-duplicate detection - check whole file for high-resemblance candidates
14//! 3. Phase 3: Query run matching (if no near-duplicates found)
15//!
16//! The near-duplicate detection finds rules with high resemblance (>= 0.8) to the
17//! entire query, which helps match combined rules instead of partial rules.
18
19mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub(crate) use candidates::{select_seq_candidates, select_seq_candidates_with_deadline};
26pub(crate) use matching::{seq_match_with_candidates, seq_match_with_candidates_and_deadline};
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32pub const HIGH_RESEMBLANCE_THRESHOLD_TENTHS: u32 = 8;
33
34#[cfg(test)]
35pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = HIGH_RESEMBLANCE_THRESHOLD_TENTHS as f32 / 10.0;
36
37/// Default number of top near-duplicate candidates to consider.
38pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
39
40#[cfg(test)]
41mod tests {
42    use super::*;
43    use crate::license_detection::index::IndexedRuleMetadata;
44    use crate::license_detection::index::LicenseIndex;
45    use crate::license_detection::index::dictionary::{TokenId, TokenKind};
46    use crate::license_detection::models::Rule;
47    use crate::license_detection::query::Query;
48    use crate::license_detection::test_utils::create_test_index;
49    use crate::license_detection::{TokenMultiset, TokenSet};
50    use crate::models::LineNumber;
51    use std::collections::HashMap;
52
53    pub(super) fn create_seq_match_test_index() -> LicenseIndex {
54        create_test_index(
55            &[
56                ("license", 0),
57                ("copyright", 1),
58                ("permission", 2),
59                ("redistribute", 3),
60                ("granted", 4),
61            ],
62            5,
63        )
64    }
65
66    pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
67        let rid = index.rules_by_rid.len();
68        let tokens: Vec<TokenId> = text
69            .split_whitespace()
70            .filter_map(|word| index.dictionary.get(word))
71            .collect();
72
73        let set = TokenSet::from_token_ids(tokens.iter().copied());
74        let mset = TokenMultiset::from_token_ids(&tokens);
75        let _ = index.sets_by_rid.insert(rid, set.clone());
76        let _ = index.msets_by_rid.insert(rid, mset);
77
78        let high_set: TokenSet =
79            TokenSet::from_u16_iter(set.iter().filter(|&tid| {
80                index.dictionary.token_kind(TokenId::new(tid)) == TokenKind::Legalese
81            }));
82        if !high_set.is_empty() {
83            let _ = index.high_sets_by_rid.insert(rid, high_set);
84        }
85
86        let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
87        for (pos, &tid) in tokens.iter().enumerate() {
88            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
89                high_postings.entry(tid).or_default().push(pos);
90            }
91        }
92        let _ = index.high_postings_by_rid.insert(rid, high_postings);
93
94        let rule = Rule {
95            identifier: format!("{}.test", expression),
96            license_expression: expression.to_string(),
97            text: text.to_string(),
98            tokens: tokens.clone(),
99            rule_kind: crate::license_detection::models::RuleKind::Text,
100            is_false_positive: false,
101            is_required_phrase: false,
102            is_from_license: false,
103            relevance: 100,
104            minimum_coverage: None,
105            has_stored_minimum_coverage: false,
106            is_continuous: true,
107            referenced_filenames: None,
108            ignorable_urls: None,
109            ignorable_emails: None,
110            ignorable_copyrights: None,
111            ignorable_holders: None,
112            ignorable_authors: None,
113            language: None,
114            notes: None,
115            length_unique: tokens.len(),
116            high_length_unique: tokens
117                .iter()
118                .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
119                .count(),
120            high_length: tokens.len(),
121            min_matched_length: 1,
122            min_high_matched_length: 1,
123            min_matched_length_unique: 1,
124            min_high_matched_length_unique: 1,
125            is_small: false,
126            is_tiny: false,
127            starts_with_license: false,
128            ends_with_license: false,
129            is_deprecated: false,
130            spdx_license_key: None,
131            other_spdx_license_keys: vec![],
132            required_phrase_spans: vec![],
133            stopwords_by_pos: std::collections::HashMap::new(),
134        };
135
136        index.rules_by_rid.push(rule.clone());
137        index.tids_by_rid.push(tokens.clone());
138        index.approx_matchable_rids.insert(rid);
139
140        // Also populate inverted index for high-value tokens
141        for &tid in &tokens {
142            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
143                index.rids_by_high_tid.entry(tid).or_default().insert(rid);
144            }
145        }
146
147        rid
148    }
149
150    #[test]
151    fn test_seq_match_basic() {
152        let mut index = create_seq_match_test_index();
153
154        add_test_rule(&mut index, "license copyright granted", "test-license");
155
156        let text = "license copyright granted here";
157        let query = Query::from_extracted_text(text, &index, false).unwrap();
158        let query_run = query.whole_query_run();
159
160        let candidates = select_seq_candidates(&index, &query_run, false, 50);
161        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
162
163        assert!(!matches.is_empty());
164        assert_eq!(matches[0].matcher, MATCH_SEQ);
165    }
166
167    #[test]
168    fn test_seq_match_uses_precomputed_spdx_expression() {
169        let mut index = create_seq_match_test_index();
170
171        add_test_rule(&mut index, "license copyright", "mit");
172        index.rule_metadata_by_identifier.insert(
173            "mit.test".to_string(),
174            IndexedRuleMetadata {
175                license_expression_spdx: Some("MIT".to_string()),
176                skip_for_required_phrase_generation: false,
177                replaced_by: vec![],
178            },
179        );
180
181        let text = "license copyright";
182        let query = Query::from_extracted_text(text, &index, false).unwrap();
183        let query_run = query.whole_query_run();
184        let candidates = select_seq_candidates(&index, &query_run, false, 50);
185        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
186
187        assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
188    }
189
190    #[test]
191    fn test_seq_match_partial_coverage_not_filtered() {
192        let mut index = create_seq_match_test_index();
193
194        add_test_rule(
195            &mut index,
196            "license copyright granted permission redistribute",
197            "test-long-license",
198        );
199
200        let text = "license copyright";
201        let query = Query::from_extracted_text(text, &index, false).unwrap();
202        let query_run = query.whole_query_run();
203
204        let candidates = select_seq_candidates(&index, &query_run, false, 50);
205        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
206
207        assert!(
208            !matches.is_empty(),
209            "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
210        );
211        assert!(matches[0].match_coverage < 50.0);
212    }
213
214    #[test]
215    fn test_seq_match_empty_query() {
216        let mut index = create_seq_match_test_index();
217
218        add_test_rule(&mut index, "license copyright", "test-license");
219
220        let text = "";
221        let query = Query::from_extracted_text(text, &index, false).unwrap();
222        let query_run = query.whole_query_run();
223
224        let candidates = select_seq_candidates(&index, &query_run, false, 50);
225        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
226
227        assert!(matches.is_empty());
228    }
229
230    #[test]
231    fn test_seq_match_constants() {
232        assert_eq!(MATCH_SEQ.as_str(), "3-seq");
233        assert_eq!(MATCH_SEQ.precedence(), 3);
234    }
235
236    #[test]
237    fn test_seq_match_with_no_legalese_intersection() {
238        let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
239
240        add_test_rule(&mut index, "word1 word2 word3", "test-license");
241
242        let text = "word1 word2 word3";
243        let query = Query::from_extracted_text(text, &index, false).unwrap();
244        let query_run = query.whole_query_run();
245
246        let candidates = select_seq_candidates(&index, &query_run, false, 50);
247        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
248
249        assert!(
250            matches.is_empty(),
251            "Should not match when tokens are not legalese (above len_legalese)"
252        );
253    }
254
255    #[test]
256    fn test_seq_match_multiple_occurrences() {
257        let mut index = create_seq_match_test_index();
258
259        add_test_rule(&mut index, "license copyright granted", "test-license");
260
261        let text = "license copyright granted some text license copyright granted more text";
262        let query = Query::from_extracted_text(text, &index, false).unwrap();
263        let query_run = query.whole_query_run();
264
265        let candidates = select_seq_candidates(&index, &query_run, false, 50);
266        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
267
268        assert!(
269            matches.len() >= 2,
270            "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
271            matches.len()
272        );
273
274        let license_expressions: Vec<&str> = matches
275            .iter()
276            .map(|m| m.license_expression.as_str())
277            .collect();
278        assert!(
279            license_expressions.iter().all(|&e| e == "test-license"),
280            "All matches should be for test-license"
281        );
282
283        let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line.get()).collect();
284        let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line.get()).collect();
285
286        assert!(
287            start_lines.iter().all(|&l| l >= 1),
288            "Start lines should be valid"
289        );
290        assert!(
291            end_lines.iter().all(|&l| l >= 1),
292            "End lines should be valid"
293        );
294    }
295
296    #[test]
297    fn test_seq_match_line_numbers_accurate() {
298        let mut index = create_seq_match_test_index();
299
300        add_test_rule(&mut index, "license copyright granted", "test-license");
301
302        let text = "line one\nlicense copyright granted\nline three";
303        let query = Query::from_extracted_text(text, &index, false).unwrap();
304        let query_run = query.whole_query_run();
305
306        let candidates = select_seq_candidates(&index, &query_run, false, 50);
307        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
308
309        assert!(!matches.is_empty(), "Should find matches");
310
311        let first_match = &matches[0];
312
313        assert_eq!(
314            first_match.start_line,
315            LineNumber::new(2).unwrap(),
316            "Match should start on line 2 (where license tokens are), not line 1"
317        );
318        assert_eq!(
319            first_match.end_line,
320            LineNumber::new(2).unwrap(),
321            "Match should end on line 2 (where license tokens are), not line 3"
322        );
323
324        // matched_text is computed lazily at output time, not during matching
325        assert!(
326            first_match.matched_text.is_none(),
327            "matched_text should be None during matching (computed lazily at output)"
328        );
329
330        // Verify we can compute it from the query
331        let matched_text =
332            query.matched_text(first_match.start_line.get(), first_match.end_line.get());
333        assert!(
334            matched_text.contains("license"),
335            "Computed matched text should contain 'license'"
336        );
337    }
338
339    #[test]
340    fn test_seq_match_line_numbers_partial_match() {
341        let mut index = create_seq_match_test_index();
342
343        add_test_rule(
344            &mut index,
345            "license copyright granted permission",
346            "test-license",
347        );
348
349        let text = "line one\nlicense copyright\nline three";
350        let query = Query::from_extracted_text(text, &index, false).unwrap();
351        let query_run = query.whole_query_run();
352
353        let candidates = select_seq_candidates(&index, &query_run, false, 50);
354        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
355
356        assert!(!matches.is_empty(), "Should find partial matches");
357
358        let first_match = &matches[0];
359
360        assert_eq!(
361            first_match.start_line,
362            LineNumber::new(2).unwrap(),
363            "Partial match should start on line 2"
364        );
365        assert_eq!(
366            first_match.end_line,
367            LineNumber::new(2).unwrap(),
368            "Partial match should end on line 2"
369        );
370
371        assert!(
372            first_match.match_coverage < 100.0,
373            "Should be partial coverage"
374        );
375    }
376}