Skip to main content

provenant/license_detection/seq_match/
mod.rs

1//! Approximate sequence matching for license detection.
2//!
3//! This module implements sequence-based matching using set similarity for
4//! candidate selection, followed by sequence alignment to find matching blocks.
5//!
6//! Based on Python ScanCode Toolkit implementation at:
7//! reference/scancode-toolkit/src/licensedcode/match_seq.py
8//!
9//! ## Near-Duplicate Detection
10//!
11//! This module implements Phase 2 of Python's 3-phase matching pipeline:
12//! 1. Phase 1: Hash & Aho-Corasick (exact matches)
13//! 2. Phase 2: Near-duplicate detection - check whole file for high-resemblance candidates
14//! 3. Phase 3: Query run matching (if no near-duplicates found)
15//!
16//! The near-duplicate detection finds rules with high resemblance (>= 0.8) to the
17//! entire query, which helps match combined rules instead of partial rules.
18
19mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub use candidates::{Candidate, compute_candidates_with_msets};
26pub use matching::seq_match_with_candidates;
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32/// Default threshold for high resemblance (0.8 = 80% similarity).
33pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = 0.8;
34
35/// Default number of top near-duplicate candidates to consider.
36pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
37
38#[cfg(test)]
39mod tests {
40    use super::*;
41    use crate::license_detection::index::IndexedRuleMetadata;
42    use crate::license_detection::index::LicenseIndex;
43    use crate::license_detection::index::dictionary::{TokenId, TokenKind};
44    use crate::license_detection::index::token_sets::build_set_and_mset;
45    use crate::license_detection::models::Rule;
46    use crate::license_detection::query::Query;
47    use crate::license_detection::test_utils::create_test_index;
48    use crate::license_detection::token_set::TokenSet;
49    use std::collections::HashMap;
50
51    pub(super) fn create_seq_match_test_index() -> LicenseIndex {
52        create_test_index(
53            &[
54                ("license", 0),
55                ("copyright", 1),
56                ("permission", 2),
57                ("redistribute", 3),
58                ("granted", 4),
59            ],
60            5,
61        )
62    }
63
64    pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
65        let rid = index.rules_by_rid.len();
66        let tokens: Vec<TokenId> = text
67            .split_whitespace()
68            .filter_map(|word| index.dictionary.get(word))
69            .collect();
70
71        let (set, mset) = build_set_and_mset(&tokens);
72        let token_set: TokenSet = TokenSet::from_u16_iter(set.iter().map(|tid| tid.raw()));
73        let _ = index.sets_by_rid.insert(rid, token_set);
74        let _ = index.msets_by_rid.insert(rid, mset);
75
76        let high_set: TokenSet = TokenSet::from_u16_iter(
77            set.iter()
78                .filter(|&&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
79                .map(|tid| tid.raw()),
80        );
81        if !high_set.is_empty() {
82            let _ = index.high_sets_by_rid.insert(rid, high_set);
83        }
84
85        let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
86        for (pos, &tid) in tokens.iter().enumerate() {
87            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
88                high_postings.entry(tid).or_default().push(pos);
89            }
90        }
91        let _ = index.high_postings_by_rid.insert(rid, high_postings);
92
93        let rule = Rule {
94            identifier: format!("{}.test", expression),
95            license_expression: expression.to_string(),
96            text: text.to_string(),
97            tokens: tokens.clone(),
98            rule_kind: crate::license_detection::models::RuleKind::Text,
99            is_false_positive: false,
100            is_required_phrase: false,
101            is_from_license: false,
102            relevance: 100,
103            minimum_coverage: None,
104            has_stored_minimum_coverage: false,
105            is_continuous: true,
106            referenced_filenames: None,
107            ignorable_urls: None,
108            ignorable_emails: None,
109            ignorable_copyrights: None,
110            ignorable_holders: None,
111            ignorable_authors: None,
112            language: None,
113            notes: None,
114            length_unique: tokens.len(),
115            high_length_unique: tokens
116                .iter()
117                .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
118                .count(),
119            high_length: tokens.len(),
120            min_matched_length: 1,
121            min_high_matched_length: 1,
122            min_matched_length_unique: 1,
123            min_high_matched_length_unique: 1,
124            is_small: false,
125            is_tiny: false,
126            starts_with_license: false,
127            ends_with_license: false,
128            is_deprecated: false,
129            spdx_license_key: None,
130            other_spdx_license_keys: vec![],
131            required_phrase_spans: vec![],
132            stopwords_by_pos: std::collections::HashMap::new(),
133        };
134
135        index.rules_by_rid.push(rule.clone());
136        index.tids_by_rid.push(tokens.clone());
137        index.approx_matchable_rids.insert(rid);
138
139        // Also populate inverted index for high-value tokens
140        for &tid in &tokens {
141            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
142                index.rids_by_high_tid.entry(tid).or_default().insert(rid);
143            }
144        }
145
146        rid
147    }
148
149    #[test]
150    fn test_seq_match_basic() {
151        let mut index = create_seq_match_test_index();
152
153        add_test_rule(&mut index, "license copyright granted", "test-license");
154
155        let text = "license copyright granted here";
156        let query = Query::from_extracted_text(text, &index, false).unwrap();
157        let query_run = query.whole_query_run();
158
159        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
160        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
161
162        assert!(!matches.is_empty());
163        assert_eq!(matches[0].matcher, MATCH_SEQ);
164    }
165
166    #[test]
167    fn test_seq_match_uses_precomputed_spdx_expression() {
168        let mut index = create_seq_match_test_index();
169
170        add_test_rule(&mut index, "license copyright", "mit");
171        index.rule_metadata_by_identifier.insert(
172            "mit.test".to_string(),
173            IndexedRuleMetadata {
174                license_expression_spdx: Some("MIT".to_string()),
175                skip_for_required_phrase_generation: false,
176                replaced_by: vec![],
177            },
178        );
179
180        let text = "license copyright";
181        let query = Query::from_extracted_text(text, &index, false).unwrap();
182        let query_run = query.whole_query_run();
183        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
184        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
185
186        assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
187    }
188
189    #[test]
190    fn test_seq_match_partial_coverage_not_filtered() {
191        let mut index = create_seq_match_test_index();
192
193        add_test_rule(
194            &mut index,
195            "license copyright granted permission redistribute",
196            "test-long-license",
197        );
198
199        let text = "license copyright";
200        let query = Query::from_extracted_text(text, &index, false).unwrap();
201        let query_run = query.whole_query_run();
202
203        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
204        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
205
206        assert!(
207            !matches.is_empty(),
208            "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
209        );
210        assert!(matches[0].match_coverage < 50.0);
211    }
212
213    #[test]
214    fn test_seq_match_empty_query() {
215        let mut index = create_seq_match_test_index();
216
217        add_test_rule(&mut index, "license copyright", "test-license");
218
219        let text = "";
220        let query = Query::from_extracted_text(text, &index, false).unwrap();
221        let query_run = query.whole_query_run();
222
223        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
224        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
225
226        assert!(matches.is_empty());
227    }
228
229    #[test]
230    fn test_seq_match_constants() {
231        assert_eq!(MATCH_SEQ.as_str(), "3-seq");
232        assert_eq!(MATCH_SEQ.precedence(), 3);
233    }
234
235    #[test]
236    fn test_seq_match_with_no_legalese_intersection() {
237        let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
238
239        add_test_rule(&mut index, "word1 word2 word3", "test-license");
240
241        let text = "word1 word2 word3";
242        let query = Query::from_extracted_text(text, &index, false).unwrap();
243        let query_run = query.whole_query_run();
244
245        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
246        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
247
248        assert!(
249            matches.is_empty(),
250            "Should not match when tokens are not legalese (above len_legalese)"
251        );
252    }
253
254    #[test]
255    fn test_seq_match_multiple_occurrences() {
256        let mut index = create_seq_match_test_index();
257
258        add_test_rule(&mut index, "license copyright granted", "test-license");
259
260        let text = "license copyright granted some text license copyright granted more text";
261        let query = Query::from_extracted_text(text, &index, false).unwrap();
262        let query_run = query.whole_query_run();
263
264        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
265        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
266
267        assert!(
268            matches.len() >= 2,
269            "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
270            matches.len()
271        );
272
273        let license_expressions: Vec<&str> = matches
274            .iter()
275            .map(|m| m.license_expression.as_str())
276            .collect();
277        assert!(
278            license_expressions.iter().all(|&e| e == "test-license"),
279            "All matches should be for test-license"
280        );
281
282        let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line).collect();
283        let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line).collect();
284
285        assert!(
286            start_lines.iter().all(|&l| l >= 1),
287            "Start lines should be valid"
288        );
289        assert!(
290            end_lines.iter().all(|&l| l >= 1),
291            "End lines should be valid"
292        );
293    }
294
295    #[test]
296    fn test_seq_match_line_numbers_accurate() {
297        let mut index = create_seq_match_test_index();
298
299        add_test_rule(&mut index, "license copyright granted", "test-license");
300
301        let text = "line one\nlicense copyright granted\nline three";
302        let query = Query::from_extracted_text(text, &index, false).unwrap();
303        let query_run = query.whole_query_run();
304
305        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
306        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
307
308        assert!(!matches.is_empty(), "Should find matches");
309
310        let first_match = &matches[0];
311
312        assert_eq!(
313            first_match.start_line, 2,
314            "Match should start on line 2 (where license tokens are), not line 1"
315        );
316        assert_eq!(
317            first_match.end_line, 2,
318            "Match should end on line 2 (where license tokens are), not line 3"
319        );
320
321        // matched_text is computed lazily at output time, not during matching
322        assert!(
323            first_match.matched_text.is_none(),
324            "matched_text should be None during matching (computed lazily at output)"
325        );
326
327        // Verify we can compute it from the query
328        let matched_text = query.matched_text(first_match.start_line, first_match.end_line);
329        assert!(
330            matched_text.contains("license"),
331            "Computed matched text should contain 'license'"
332        );
333    }
334
335    #[test]
336    fn test_seq_match_line_numbers_partial_match() {
337        let mut index = create_seq_match_test_index();
338
339        add_test_rule(
340            &mut index,
341            "license copyright granted permission",
342            "test-license",
343        );
344
345        let text = "line one\nlicense copyright\nline three";
346        let query = Query::from_extracted_text(text, &index, false).unwrap();
347        let query_run = query.whole_query_run();
348
349        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
350        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
351
352        assert!(!matches.is_empty(), "Should find partial matches");
353
354        let first_match = &matches[0];
355
356        assert_eq!(
357            first_match.start_line, 2,
358            "Partial match should start on line 2"
359        );
360        assert_eq!(
361            first_match.end_line, 2,
362            "Partial match should end on line 2"
363        );
364
365        assert!(
366            first_match.match_coverage < 100.0,
367            "Should be partial coverage"
368        );
369    }
370}