Skip to main content

provenant/license_detection/seq_match/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Approximate sequence matching for license detection.
5//!
6//! This module implements sequence-based matching using set similarity for
7//! candidate selection, followed by sequence alignment to find matching blocks.
8//!
9//! Based on Python ScanCode Toolkit implementation at:
10//! reference/scancode-toolkit/src/licensedcode/match_seq.py
11//!
12//! ## Near-Duplicate Detection
13//!
14//! This module implements Phase 2 of Python's 3-phase matching pipeline:
15//! 1. Phase 1: Hash & Aho-Corasick (exact matches)
16//! 2. Phase 2: Near-duplicate detection - check whole file for high-resemblance candidates
17//! 3. Phase 3: Query run matching (if no near-duplicates found)
18//!
19//! The near-duplicate detection finds rules with high resemblance (>= 0.8) to the
20//! entire query, which helps match combined rules instead of partial rules.
21
22mod candidates;
23mod matching;
24
25#[cfg(test)]
26mod gfdl_debug_test;
27
28pub(crate) use candidates::{select_seq_candidates, select_seq_candidates_with_deadline};
29pub(crate) use matching::{seq_match_with_candidates, seq_match_with_candidates_and_deadline};
30
31use crate::license_detection::models::MatcherKind;
32
33pub const MATCH_SEQ: MatcherKind = MatcherKind::Seq;
34
35pub const HIGH_RESEMBLANCE_THRESHOLD_TENTHS: u32 = 8;
36
37#[cfg(test)]
38pub const HIGH_RESEMBLANCE_THRESHOLD: f32 = HIGH_RESEMBLANCE_THRESHOLD_TENTHS as f32 / 10.0;
39
40/// Default number of top near-duplicate candidates to consider.
41pub const MAX_NEAR_DUPE_CANDIDATES: usize = 10;
42
43#[cfg(test)]
44mod tests {
45    use super::*;
46    use crate::license_detection::index::IndexedRuleMetadata;
47    use crate::license_detection::index::LicenseIndex;
48    use crate::license_detection::index::dictionary::{TokenId, TokenKind};
49    use crate::license_detection::models::Rule;
50    use crate::license_detection::query::Query;
51    use crate::license_detection::test_utils::create_test_index;
52    use crate::license_detection::{TokenMultiset, TokenSet};
53    use crate::models::LineNumber;
54    use std::collections::HashMap;
55
56    pub(super) fn create_seq_match_test_index() -> LicenseIndex {
57        create_test_index(
58            &[
59                ("license", 0),
60                ("copyright", 1),
61                ("permission", 2),
62                ("redistribute", 3),
63                ("granted", 4),
64            ],
65            5,
66        )
67    }
68
69    pub(super) fn add_test_rule(index: &mut LicenseIndex, text: &str, expression: &str) -> usize {
70        let rid = index.rules_by_rid.len();
71        let tokens: Vec<TokenId> = text
72            .split_whitespace()
73            .filter_map(|word| index.dictionary.get(word))
74            .collect();
75
76        let set = TokenSet::from_token_ids(tokens.iter().copied());
77        let mset = TokenMultiset::from_token_ids(&tokens);
78        let _ = index.sets_by_rid.insert(rid, set.clone());
79        let _ = index.msets_by_rid.insert(rid, mset);
80
81        let high_set: TokenSet =
82            TokenSet::from_u16_iter(set.iter().filter(|&tid| {
83                index.dictionary.token_kind(TokenId::new(tid)) == TokenKind::Legalese
84            }));
85        if !high_set.is_empty() {
86            let _ = index.high_sets_by_rid.insert(rid, high_set);
87        }
88
89        let mut high_postings: HashMap<TokenId, Vec<usize>> = HashMap::new();
90        for (pos, &tid) in tokens.iter().enumerate() {
91            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
92                high_postings.entry(tid).or_default().push(pos);
93            }
94        }
95        let _ = index.high_postings_by_rid.insert(rid, high_postings);
96
97        let rule = Rule {
98            identifier: format!("{}.test", expression),
99            license_expression: expression.to_string(),
100            text: text.to_string(),
101            tokens: tokens.clone(),
102            rule_kind: crate::license_detection::models::RuleKind::Text,
103            is_false_positive: false,
104            is_required_phrase: false,
105            is_from_license: false,
106            relevance: 100,
107            minimum_coverage: None,
108            has_stored_minimum_coverage: false,
109            is_continuous: true,
110            referenced_filenames: None,
111            ignorable_urls: None,
112            ignorable_emails: None,
113            ignorable_copyrights: None,
114            ignorable_holders: None,
115            ignorable_authors: None,
116            language: None,
117            notes: None,
118            length_unique: tokens.len(),
119            high_length_unique: tokens
120                .iter()
121                .filter(|&&t| index.dictionary.token_kind(t) == TokenKind::Legalese)
122                .count(),
123            high_length: tokens.len(),
124            min_matched_length: 1,
125            min_high_matched_length: 1,
126            min_matched_length_unique: 1,
127            min_high_matched_length_unique: 1,
128            is_small: false,
129            is_tiny: false,
130            starts_with_license: false,
131            ends_with_license: false,
132            is_deprecated: false,
133            spdx_license_key: None,
134            other_spdx_license_keys: vec![],
135            required_phrase_spans: vec![],
136            stopwords_by_pos: std::collections::HashMap::new(),
137        };
138
139        index.rules_by_rid.push(rule.clone());
140        index.tids_by_rid.push(tokens.clone());
141        index.approx_matchable_rids.insert(rid);
142
143        // Also populate inverted index for high-value tokens
144        for &tid in &tokens {
145            if index.dictionary.token_kind(tid) == TokenKind::Legalese {
146                index.rids_by_high_tid.entry(tid).or_default().insert(rid);
147            }
148        }
149
150        rid
151    }
152
153    #[test]
154    fn test_seq_match_basic() {
155        let mut index = create_seq_match_test_index();
156
157        add_test_rule(&mut index, "license copyright granted", "test-license");
158
159        let text = "license copyright granted here";
160        let query = Query::from_extracted_text(text, &index, false).unwrap();
161        let query_run = query.whole_query_run();
162
163        let candidates = select_seq_candidates(&index, &query_run, false, 50);
164        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
165
166        assert!(!matches.is_empty());
167        assert_eq!(matches[0].matcher, MATCH_SEQ);
168    }
169
170    #[test]
171    fn test_seq_match_uses_precomputed_spdx_expression() {
172        let mut index = create_seq_match_test_index();
173
174        add_test_rule(&mut index, "license copyright", "mit");
175        index.rule_metadata_by_identifier.insert(
176            "mit.test".to_string(),
177            IndexedRuleMetadata {
178                license_expression_spdx: Some("MIT".to_string()),
179                skip_for_required_phrase_generation: false,
180                replaced_by: vec![],
181            },
182        );
183
184        let text = "license copyright";
185        let query = Query::from_extracted_text(text, &index, false).unwrap();
186        let query_run = query.whole_query_run();
187        let candidates = select_seq_candidates(&index, &query_run, false, 50);
188        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
189
190        assert_eq!(matches[0].license_expression_spdx.as_deref(), Some("MIT"));
191    }
192
193    #[test]
194    fn test_seq_match_partial_coverage_not_filtered() {
195        let mut index = create_seq_match_test_index();
196
197        add_test_rule(
198            &mut index,
199            "license copyright granted permission redistribute",
200            "test-long-license",
201        );
202
203        let text = "license copyright";
204        let query = Query::from_extracted_text(text, &index, false).unwrap();
205        let query_run = query.whole_query_run();
206
207        let candidates = select_seq_candidates(&index, &query_run, false, 50);
208        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
209
210        assert!(
211            !matches.is_empty(),
212            "Partial coverage matches should NOT be filtered (Python has no 50% coverage filter)"
213        );
214        assert!(matches[0].match_coverage < 50.0);
215    }
216
217    #[test]
218    fn test_seq_match_empty_query() {
219        let mut index = create_seq_match_test_index();
220
221        add_test_rule(&mut index, "license copyright", "test-license");
222
223        let text = "";
224        let query = Query::from_extracted_text(text, &index, false).unwrap();
225        let query_run = query.whole_query_run();
226
227        let candidates = select_seq_candidates(&index, &query_run, false, 50);
228        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
229
230        assert!(matches.is_empty());
231    }
232
233    #[test]
234    fn test_seq_match_constants() {
235        assert_eq!(MATCH_SEQ.as_str(), "3-seq");
236        assert_eq!(MATCH_SEQ.precedence(), 3);
237    }
238
239    #[test]
240    fn test_seq_match_with_no_legalese_intersection() {
241        let mut index = create_test_index(&[("word1", 10), ("word2", 11), ("word3", 12)], 5);
242
243        add_test_rule(&mut index, "word1 word2 word3", "test-license");
244
245        let text = "word1 word2 word3";
246        let query = Query::from_extracted_text(text, &index, false).unwrap();
247        let query_run = query.whole_query_run();
248
249        let candidates = select_seq_candidates(&index, &query_run, false, 50);
250        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
251
252        assert!(
253            matches.is_empty(),
254            "Should not match when tokens are not legalese (above len_legalese)"
255        );
256    }
257
258    #[test]
259    fn test_seq_match_multiple_occurrences() {
260        let mut index = create_seq_match_test_index();
261
262        add_test_rule(&mut index, "license copyright granted", "test-license");
263
264        let text = "license copyright granted some text license copyright granted more text";
265        let query = Query::from_extracted_text(text, &index, false).unwrap();
266        let query_run = query.whole_query_run();
267
268        let candidates = select_seq_candidates(&index, &query_run, false, 50);
269        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
270
271        assert!(
272            matches.len() >= 2,
273            "Should find multiple matches for the same rule appearing multiple times in query, got {} matches",
274            matches.len()
275        );
276
277        let license_expressions: Vec<&str> = matches
278            .iter()
279            .map(|m| m.license_expression.as_str())
280            .collect();
281        assert!(
282            license_expressions.iter().all(|&e| e == "test-license"),
283            "All matches should be for test-license"
284        );
285
286        let start_lines: Vec<usize> = matches.iter().map(|m| m.start_line.get()).collect();
287        let end_lines: Vec<usize> = matches.iter().map(|m| m.end_line.get()).collect();
288
289        assert!(
290            start_lines.iter().all(|&l| l >= 1),
291            "Start lines should be valid"
292        );
293        assert!(
294            end_lines.iter().all(|&l| l >= 1),
295            "End lines should be valid"
296        );
297    }
298
299    #[test]
300    fn test_seq_match_line_numbers_accurate() {
301        let mut index = create_seq_match_test_index();
302
303        add_test_rule(&mut index, "license copyright granted", "test-license");
304
305        let text = "line one\nlicense copyright granted\nline three";
306        let query = Query::from_extracted_text(text, &index, false).unwrap();
307        let query_run = query.whole_query_run();
308
309        let candidates = select_seq_candidates(&index, &query_run, false, 50);
310        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
311
312        assert!(!matches.is_empty(), "Should find matches");
313
314        let first_match = &matches[0];
315
316        assert_eq!(
317            first_match.start_line,
318            LineNumber::new(2).unwrap(),
319            "Match should start on line 2 (where license tokens are), not line 1"
320        );
321        assert_eq!(
322            first_match.end_line,
323            LineNumber::new(2).unwrap(),
324            "Match should end on line 2 (where license tokens are), not line 3"
325        );
326
327        // matched_text is computed lazily at output time, not during matching
328        assert!(
329            first_match.matched_text.is_none(),
330            "matched_text should be None during matching (computed lazily at output)"
331        );
332
333        // Verify we can compute it from the query
334        let matched_text =
335            query.matched_text(first_match.start_line.get(), first_match.end_line.get());
336        assert!(
337            matched_text.contains("license"),
338            "Computed matched text should contain 'license'"
339        );
340    }
341
342    #[test]
343    fn test_seq_match_line_numbers_partial_match() {
344        let mut index = create_seq_match_test_index();
345
346        add_test_rule(
347            &mut index,
348            "license copyright granted permission",
349            "test-license",
350        );
351
352        let text = "line one\nlicense copyright\nline three";
353        let query = Query::from_extracted_text(text, &index, false).unwrap();
354        let query_run = query.whole_query_run();
355
356        let candidates = select_seq_candidates(&index, &query_run, false, 50);
357        let matches = seq_match_with_candidates(&index, &query_run, &candidates);
358
359        assert!(!matches.is_empty(), "Should find partial matches");
360
361        let first_match = &matches[0];
362
363        assert_eq!(
364            first_match.start_line,
365            LineNumber::new(2).unwrap(),
366            "Partial match should start on line 2"
367        );
368        assert_eq!(
369            first_match.end_line,
370            LineNumber::new(2).unwrap(),
371            "Partial match should end on line 2"
372        );
373
374        assert!(
375            first_match.match_coverage < 100.0,
376            "Should be partial coverage"
377        );
378    }
379}