Skip to main content

provenant/license_detection/seq_match/
mod.rs

1//! Approximate sequence matching for license detection.
2//!
3//! This module implements sequence-based matching using set similarity for
4//! candidate selection, followed by sequence alignment to find matching blocks.
5//!
6//! Based on Python ScanCode Toolkit implementation at:
7//! reference/scancode-toolkit/src/licensedcode/match_seq.py
8//!
9//! ## Near-Duplicate Detection
10//!
11//! This module implements Phase 2 of Python's 3-phase matching pipeline:
12//! 1. Phase 1: Hash & Aho-Corasick (exact matches)
13//! 2. Phase 2: Near-duplicate detection - check whole file for high-resemblance candidates
14//! 3. Phase 3: Query run matching (if no near-duplicates found)
15//!
16//! The near-duplicate detection finds rules with high resemblance (>= 0.8) to the
17//! entire query, which helps match combined rules instead of partial rules.
18
19mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub(crate) use candidates::select_seq_candidates;
26pub(crate) use matching::seq_match_with_candidates;
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32/// Default threshold for high resemblance (0.8 = 80% similarity).
33pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = 0.8;
34
35/// Default number of top near-duplicate candidates to consider.
36pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
37
38#[cfg(test)]
39mod tests {
40    use super::*;
41    use crate::license_detection::index::IndexedRuleMetadata;
42    use crate::license_detection::index::LicenseIndex;
43    use crate::license_detection::index::dictionary::{TokenId, TokenKind};
44    use crate::license_detection::models::Rule;
45    use crate::license_detection::query::Query;
46    use crate::license_detection::test_utils::create_test_index;
47    use crate::license_detection::{TokenMultiset, TokenSet};
48    use std::collections::HashMap;
49
50    pub(super) fn create_seq_match_test_index() -> LicenseIndex {
51        create_test_index(
52            &[
53                ("license", 0),
54                ("copyright", 1),
55                ("permission", 2),
56                ("redistribute", 3),
57                ("granted", 4),
58            ],
59            5,
60        )
61    }
62
63    pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
64        let rid = index.rules_by_rid.len();
65        let tokens: Vec<TokenId> = text
66            .split_whitespace()
67            .filter_map(|word| index.dictionary.get(word))
68            .collect();
69
70        let set = TokenSet::from_token_ids(tokens.iter().copied());
71        let mset = TokenMultiset::from_token_ids(&tokens);
72        let _ = index.sets_by_rid.insert(rid, set.clone());
73        let _ = index.msets_by_rid.insert(rid, mset);
74
75        let high_set: TokenSet =
76            TokenSet::from_u16_iter(set.iter().filter(|&tid| {
77                index.dictionary.token_kind(TokenId::new(tid)) == TokenKind::Legalese
78            }));
79        if !high_set.is_empty() {
80            let _ = index.high_sets_by_rid.insert(rid, high_set);
81        }
82
83        let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
84        for (pos, &tid) in tokens.iter().enumerate() {
85            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
86                high_postings.entry(tid).or_default().push(pos);
87            }
88        }
89        let _ = index.high_postings_by_rid.insert(rid, high_postings);
90
91        let rule = Rule {
92            identifier: format!("{}.test", expression),
93            license_expression: expression.to_string(),
94            text: text.to_string(),
95            tokens: tokens.clone(),
96            rule_kind: crate::license_detection::models::RuleKind::Text,
97            is_false_positive: false,
98            is_required_phrase: false,
99            is_from_license: false,
100            relevance: 100,
101            minimum_coverage: None,
102            has_stored_minimum_coverage: false,
103            is_continuous: true,
104            referenced_filenames: None,
105            ignorable_urls: None,
106            ignorable_emails: None,
107            ignorable_copyrights: None,
108            ignorable_holders: None,
109            ignorable_authors: None,
110            language: None,
111            notes: None,
112            length_unique: tokens.len(),
113            high_length_unique: tokens
114                .iter()
115                .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
116                .count(),
117            high_length: tokens.len(),
118            min_matched_length: 1,
119            min_high_matched_length: 1,
120            min_matched_length_unique: 1,
121            min_high_matched_length_unique: 1,
122            is_small: false,
123            is_tiny: false,
124            starts_with_license: false,
125            ends_with_license: false,
126            is_deprecated: false,
127            spdx_license_key: None,
128            other_spdx_license_keys: vec![],
129            required_phrase_spans: vec![],
130            stopwords_by_pos: std::collections::HashMap::new(),
131        };
132
133        index.rules_by_rid.push(rule.clone());
134        index.tids_by_rid.push(tokens.clone());
135        index.approx_matchable_rids.insert(rid);
136
137        // Also populate inverted index for high-value tokens
138        for &tid in &tokens {
139            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
140                index.rids_by_high_tid.entry(tid).or_default().insert(rid);
141            }
142        }
143
144        rid
145    }
146
147    #[test]
148    fn test_seq_match_basic() {
149        let mut index = create_seq_match_test_index();
150
151        add_test_rule(&mut index, "license copyright granted", "test-license");
152
153        let text = "license copyright granted here";
154        let query = Query::from_extracted_text(text, &index, false).unwrap();
155        let query_run = query.whole_query_run();
156
157        let candidates = select_seq_candidates(&index, &query_run, false, 50);
158        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
159
160        assert!(!matches.is_empty());
161        assert_eq!(matches[0].matcher, MATCH_SEQ);
162    }
163
164    #[test]
165    fn test_seq_match_uses_precomputed_spdx_expression() {
166        let mut index = create_seq_match_test_index();
167
168        add_test_rule(&mut index, "license copyright", "mit");
169        index.rule_metadata_by_identifier.insert(
170            "mit.test".to_string(),
171            IndexedRuleMetadata {
172                license_expression_spdx: Some("MIT".to_string()),
173                skip_for_required_phrase_generation: false,
174                replaced_by: vec![],
175            },
176        );
177
178        let text = "license copyright";
179        let query = Query::from_extracted_text(text, &index, false).unwrap();
180        let query_run = query.whole_query_run();
181        let candidates = select_seq_candidates(&index, &query_run, false, 50);
182        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
183
184        assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
185    }
186
187    #[test]
188    fn test_seq_match_partial_coverage_not_filtered() {
189        let mut index = create_seq_match_test_index();
190
191        add_test_rule(
192            &mut index,
193            "license copyright granted permission redistribute",
194            "test-long-license",
195        );
196
197        let text = "license copyright";
198        let query = Query::from_extracted_text(text, &index, false).unwrap();
199        let query_run = query.whole_query_run();
200
201        let candidates = select_seq_candidates(&index, &query_run, false, 50);
202        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
203
204        assert!(
205            !matches.is_empty(),
206            "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
207        );
208        assert!(matches[0].match_coverage < 50.0);
209    }
210
211    #[test]
212    fn test_seq_match_empty_query() {
213        let mut index = create_seq_match_test_index();
214
215        add_test_rule(&mut index, "license copyright", "test-license");
216
217        let text = "";
218        let query = Query::from_extracted_text(text, &index, false).unwrap();
219        let query_run = query.whole_query_run();
220
221        let candidates = select_seq_candidates(&index, &query_run, false, 50);
222        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
223
224        assert!(matches.is_empty());
225    }
226
227    #[test]
228    fn test_seq_match_constants() {
229        assert_eq!(MATCH_SEQ.as_str(), "3-seq");
230        assert_eq!(MATCH_SEQ.precedence(), 3);
231    }
232
233    #[test]
234    fn test_seq_match_with_no_legalese_intersection() {
235        let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
236
237        add_test_rule(&mut index, "word1 word2 word3", "test-license");
238
239        let text = "word1 word2 word3";
240        let query = Query::from_extracted_text(text, &index, false).unwrap();
241        let query_run = query.whole_query_run();
242
243        let candidates = select_seq_candidates(&index, &query_run, false, 50);
244        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
245
246        assert!(
247            matches.is_empty(),
248            "Should not match when tokens are not legalese (above len_legalese)"
249        );
250    }
251
252    #[test]
253    fn test_seq_match_multiple_occurrences() {
254        let mut index = create_seq_match_test_index();
255
256        add_test_rule(&mut index, "license copyright granted", "test-license");
257
258        let text = "license copyright granted some text license copyright granted more text";
259        let query = Query::from_extracted_text(text, &index, false).unwrap();
260        let query_run = query.whole_query_run();
261
262        let candidates = select_seq_candidates(&index, &query_run, false, 50);
263        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
264
265        assert!(
266            matches.len() >= 2,
267            "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
268            matches.len()
269        );
270
271        let license_expressions: Vec<&str> = matches
272            .iter()
273            .map(|m| m.license_expression.as_str())
274            .collect();
275        assert!(
276            license_expressions.iter().all(|&e| e == "test-license"),
277            "All matches should be for test-license"
278        );
279
280        let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line).collect();
281        let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line).collect();
282
283        assert!(
284            start_lines.iter().all(|&l| l >= 1),
285            "Start lines should be valid"
286        );
287        assert!(
288            end_lines.iter().all(|&l| l >= 1),
289            "End lines should be valid"
290        );
291    }
292
293    #[test]
294    fn test_seq_match_line_numbers_accurate() {
295        let mut index = create_seq_match_test_index();
296
297        add_test_rule(&mut index, "license copyright granted", "test-license");
298
299        let text = "line one\nlicense copyright granted\nline three";
300        let query = Query::from_extracted_text(text, &index, false).unwrap();
301        let query_run = query.whole_query_run();
302
303        let candidates = select_seq_candidates(&index, &query_run, false, 50);
304        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
305
306        assert!(!matches.is_empty(), "Should find matches");
307
308        let first_match = &matches[0];
309
310        assert_eq!(
311            first_match.start_line, 2,
312            "Match should start on line 2 (where license tokens are), not line 1"
313        );
314        assert_eq!(
315            first_match.end_line, 2,
316            "Match should end on line 2 (where license tokens are), not line 3"
317        );
318
319        // matched_text is computed lazily at output time, not during matching
320        assert!(
321            first_match.matched_text.is_none(),
322            "matched_text should be None during matching (computed lazily at output)"
323        );
324
325        // Verify we can compute it from the query
326        let matched_text = query.matched_text(first_match.start_line, first_match.end_line);
327        assert!(
328            matched_text.contains("license"),
329            "Computed matched text should contain 'license'"
330        );
331    }
332
333    #[test]
334    fn test_seq_match_line_numbers_partial_match() {
335        let mut index = create_seq_match_test_index();
336
337        add_test_rule(
338            &mut index,
339            "license copyright granted permission",
340            "test-license",
341        );
342
343        let text = "line one\nlicense copyright\nline three";
344        let query = Query::from_extracted_text(text, &index, false).unwrap();
345        let query_run = query.whole_query_run();
346
347        let candidates = select_seq_candidates(&index, &query_run, false, 50);
348        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
349
350        assert!(!matches.is_empty(), "Should find partial matches");
351
352        let first_match = &matches[0];
353
354        assert_eq!(
355            first_match.start_line, 2,
356            "Partial match should start on line 2"
357        );
358        assert_eq!(
359            first_match.end_line, 2,
360            "Partial match should end on line 2"
361        );
362
363        assert!(
364            first_match.match_coverage < 100.0,
365            "Should be partial coverage"
366        );
367    }
368}