Skip to main content

provenant/license_detection/seq_match/
mod.rs

1//! Approximate sequence matching for license detection.
2//!
3//! This module implements sequence-based matching using set similarity for
4//! candidate selection, followed by sequence alignment to find matching blocks.
5//!
6//! Based on Python ScanCode Toolkit implementation at:
7//! reference/scancode-toolkit/src/licensedcode/match_seq.py
8//!
9//! ## Near-Duplicate Detection
10//!
11//! This module implements Phase 2 of Python's 3-phase matching pipeline:
12//! 1. Phase 1: Hash & Aho-Corasick (exact matches)
13//! 2. Phase 2: Near-duplicate detection - check whole file for high-resemblance candidates
14//! 3. Phase 3: Query run matching (if no near-duplicates found)
15//!
16//! The near-duplicate detection finds rules with high resemblance (>= 0.8) to the
17//! entire query, which helps match combined rules instead of partial rules.
18
19mod candidates;
20mod matching;
21
22#[cfg(test)]
23mod gfdl_debug_test;
24
25pub use candidates::{Candidate, compute_candidates_with_msets};
26pub use matching::seq_match_with_candidates;
27
28use crate::license_detection::models::MatcherKind;
29
30pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
31
32/// Default threshold for high resemblance (0.8 = 80% similarity).
33pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = 0.8;
34
35/// Default number of top near-duplicate candidates to consider.
36pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
37
38#[cfg(test)]
39mod tests {
40    use super::*;
41    use crate::license_detection::index::LicenseIndex;
42    use crate::license_detection::index::dictionary::{TokenId, TokenKind};
43    use crate::license_detection::index::token_sets::build_set_and_mset;
44    use crate::license_detection::models::Rule;
45    use crate::license_detection::query::Query;
46    use crate::license_detection::test_utils::create_test_index;
47    use std::collections::{HashMap, HashSet};
48
49    pub(super) fn create_seq_match_test_index() -> LicenseIndex {
50        create_test_index(
51            &[
52                ("license", 0),
53                ("copyright", 1),
54                ("permission", 2),
55                ("redistribute", 3),
56                ("granted", 4),
57            ],
58            5,
59        )
60    }
61
62    pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
63        let rid = index.rules_by_rid.len();
64        let tokens: Vec<TokenId> = text
65            .split_whitespace()
66            .filter_map(|word| index.dictionary.get(word))
67            .collect();
68
69        let (set, mset) = build_set_and_mset(&tokens);
70        let _ = index.sets_by_rid.insert(rid, set.clone());
71        let _ = index.msets_by_rid.insert(rid, mset);
72
73        let high_set: HashSet<TokenId> = set
74            .iter()
75            .filter(|&&tid| index.dictionary.token_kind(tid) == TokenKind::Legalese)
76            .copied()
77            .collect();
78        if !high_set.is_empty() {
79            let _ = index.high_sets_by_rid.insert(rid, high_set);
80        }
81
82        let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
83        for (pos, &tid) in tokens.iter().enumerate() {
84            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
85                high_postings.entry(tid).or_default().push(pos);
86            }
87        }
88        let _ = index.high_postings_by_rid.insert(rid, high_postings);
89
90        let rule = Rule {
91            identifier: format!("{}.test", expression),
92            license_expression: expression.to_string(),
93            text: text.to_string(),
94            tokens: tokens.clone(),
95            rule_kind: crate::license_detection::models::RuleKind::Text,
96            is_false_positive: false,
97            is_required_phrase: false,
98            is_from_license: false,
99            relevance: 100,
100            minimum_coverage: None,
101            has_stored_minimum_coverage: false,
102            is_continuous: true,
103            referenced_filenames: None,
104            ignorable_urls: None,
105            ignorable_emails: None,
106            ignorable_copyrights: None,
107            ignorable_holders: None,
108            ignorable_authors: None,
109            language: None,
110            notes: None,
111            length_unique: tokens.len(),
112            high_length_unique: tokens
113                .iter()
114                .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
115                .count(),
116            high_length: tokens.len(),
117            min_matched_length: 1,
118            min_high_matched_length: 1,
119            min_matched_length_unique: 1,
120            min_high_matched_length_unique: 1,
121            is_small: false,
122            is_tiny: false,
123            starts_with_license: false,
124            ends_with_license: false,
125            is_deprecated: false,
126            spdx_license_key: None,
127            other_spdx_license_keys: vec![],
128            required_phrase_spans: vec![],
129            stopwords_by_pos: std::collections::HashMap::new(),
130        };
131
132        index.rules_by_rid.push(rule.clone());
133        index.tids_by_rid.push(tokens.clone());
134        index.approx_matchable_rids.insert(rid);
135
136        // Also populate inverted index for high-value tokens
137        for &tid in &tokens {
138            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
139                index.rids_by_high_tid.entry(tid).or_default().insert(rid);
140            }
141        }
142
143        rid
144    }
145
146    #[test]
147    fn test_seq_match_basic() {
148        let mut index = create_seq_match_test_index();
149
150        add_test_rule(&mut index, "license copyright granted", "test-license");
151
152        let text = "license copyright granted here";
153        let query = Query::from_extracted_text(text, &index, false).unwrap();
154        let query_run = query.whole_query_run();
155
156        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
157        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
158
159        assert!(!matches.is_empty());
160        assert_eq!(matches[0].matcher, MATCH_SEQ);
161    }
162
163    #[test]
164    fn test_seq_match_partial_coverage_not_filtered() {
165        let mut index = create_seq_match_test_index();
166
167        add_test_rule(
168            &mut index,
169            "license copyright granted permission redistribute",
170            "test-long-license",
171        );
172
173        let text = "license copyright";
174        let query = Query::from_extracted_text(text, &index, false).unwrap();
175        let query_run = query.whole_query_run();
176
177        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
178        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
179
180        assert!(
181            !matches.is_empty(),
182            "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
183        );
184        assert!(matches[0].match_coverage < 50.0);
185    }
186
187    #[test]
188    fn test_seq_match_empty_query() {
189        let mut index = create_seq_match_test_index();
190
191        add_test_rule(&mut index, "license copyright", "test-license");
192
193        let text = "";
194        let query = Query::from_extracted_text(text, &index, false).unwrap();
195        let query_run = query.whole_query_run();
196
197        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
198        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
199
200        assert!(matches.is_empty());
201    }
202
203    #[test]
204    fn test_seq_match_constants() {
205        assert_eq!(MATCH_SEQ.as_str(), "3-seq");
206        assert_eq!(MATCH_SEQ.precedence(), 3);
207    }
208
209    #[test]
210    fn test_seq_match_with_no_legalese_intersection() {
211        let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
212
213        add_test_rule(&mut index, "word1 word2 word3", "test-license");
214
215        let text = "word1 word2 word3";
216        let query = Query::from_extracted_text(text, &index, false).unwrap();
217        let query_run = query.whole_query_run();
218
219        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
220        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
221
222        assert!(
223            matches.is_empty(),
224            "Should not match when tokens are not legalese (above len_legalese)"
225        );
226    }
227
228    #[test]
229    fn test_seq_match_multiple_occurrences() {
230        let mut index = create_seq_match_test_index();
231
232        add_test_rule(&mut index, "license copyright granted", "test-license");
233
234        let text = "license copyright granted some text license copyright granted more text";
235        let query = Query::from_extracted_text(text, &index, false).unwrap();
236        let query_run = query.whole_query_run();
237
238        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
239        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
240
241        assert!(
242            matches.len() >= 2,
243            "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
244            matches.len()
245        );
246
247        let license_expressions: Vec<&str> = matches
248            .iter()
249            .map(|m| m.license_expression.as_str())
250            .collect();
251        assert!(
252            license_expressions.iter().all(|&e| e == "test-license"),
253            "All matches should be for test-license"
254        );
255
256        let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line).collect();
257        let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line).collect();
258
259        assert!(
260            start_lines.iter().all(|&l| l >= 1),
261            "Start lines should be valid"
262        );
263        assert!(
264            end_lines.iter().all(|&l| l >= 1),
265            "End lines should be valid"
266        );
267    }
268
269    #[test]
270    fn test_seq_match_line_numbers_accurate() {
271        let mut index = create_seq_match_test_index();
272
273        add_test_rule(&mut index, "license copyright granted", "test-license");
274
275        let text = "line one\nlicense copyright granted\nline three";
276        let query = Query::from_extracted_text(text, &index, false).unwrap();
277        let query_run = query.whole_query_run();
278
279        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
280        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
281
282        assert!(!matches.is_empty(), "Should find matches");
283
284        let first_match = &matches[0];
285
286        assert_eq!(
287            first_match.start_line, 2,
288            "Match should start on line 2 (where license tokens are), not line 1"
289        );
290        assert_eq!(
291            first_match.end_line, 2,
292            "Match should end on line 2 (where license tokens are), not line 3"
293        );
294
295        // matched_text is computed lazily at output time, not during matching
296        assert!(
297            first_match.matched_text.is_none(),
298            "matched_text should be None during matching (computed lazily at output)"
299        );
300
301        // Verify we can compute it from the query
302        let matched_text = query.matched_text(first_match.start_line, first_match.end_line);
303        assert!(
304            matched_text.contains("license"),
305            "Computed matched text should contain 'license'"
306        );
307    }
308
309    #[test]
310    fn test_seq_match_line_numbers_partial_match() {
311        let mut index = create_seq_match_test_index();
312
313        add_test_rule(
314            &mut index,
315            "license copyright granted permission",
316            "test-license",
317        );
318
319        let text = "line one\nlicense copyright\nline three";
320        let query = Query::from_extracted_text(text, &index, false).unwrap();
321        let query_run = query.whole_query_run();
322
323        let candidates = compute_candidates_with_msets(&index, &query_run, false, 50);
324        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
325
326        assert!(!matches.is_empty(), "Should find partial matches");
327
328        let first_match = &matches[0];
329
330        assert_eq!(
331            first_match.start_line, 2,
332            "Partial match should start on line 2"
333        );
334        assert_eq!(
335            first_match.end_line, 2,
336            "Partial match should end on line 2"
337        );
338
339        assert!(
340            first_match.match_coverage < 100.0,
341            "Should be partial coverage"
342        );
343    }
344}