// provenant/license_detection/index/mod.rs
//! License index construction and querying.

pub mod builder;
pub mod dictionary;
pub mod token_sets;

// build_index is used by library tests (see spdx_lid/test.rs, index/builder/tests.rs)
// even though the binary doesn't use it directly.
#[allow(unused_imports)]
pub use builder::{
    build_index, build_index_from_loaded, loaded_license_to_license, loaded_rule_to_rule,
};

use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
use aho_corasick::AhoCorasick;
use std::collections::{HashMap, HashSet};
17
/// Type alias for the Aho-Corasick automaton used for multi-pattern matching.
///
/// The automaton is built from u16 token sequences encoded as bytes.
/// Each token is encoded as 2 bytes in little-endian format.
///
/// Based on the Python ScanCode Toolkit implementation at:
/// reference/scancode-toolkit/src/licensedcode/match_aho.py
pub type Automaton = AhoCorasick;
26
/// License index containing all data structures for efficient license detection.
///
/// The LicenseIndex holds multiple index structures that enable different matching
/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
/// candidate selection, and sequence matching.
///
/// Based on the Python ScanCode Toolkit implementation at:
/// reference/scancode-toolkit/src/licensedcode/index.py
///
/// # Index Structures
///
/// The index maintains several data structures for different matching strategies:
///
/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
/// - **Rule classification**: `false_positive_rids`, `approx_matchable_rids`
#[derive(Debug, Clone)]
pub struct LicenseIndex {
    /// Token dictionary mapping token strings to integer IDs.
    ///
    /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
    /// IDs len_legalese and above are assigned to other tokens as encountered.
    pub dictionary: TokenDictionary,

    /// Number of legalese tokens.
    ///
    /// Tokens with ID < len_legalese are considered high-value legalese words.
    /// Tokens with ID >= len_legalese are considered low-value tokens.
    ///
    /// Corresponds to Python: `self.len_legalese = 0` (line 185)
    pub len_legalese: usize,

    /// Mapping from rule hash to rule ID for hash-based exact matching.
    ///
    /// This enables fast exact matches using a hash of the rule's token IDs.
    /// Each hash maps to exactly one rule ID.
    ///
    /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
    /// In practice, we use a HashMap<[u8; 20], usize>.
    ///
    /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
    pub rid_by_hash: HashMap<[u8; 20], usize>,

    /// Rules indexed by rule ID.
    ///
    /// Maps rule IDs to Rule objects for quick lookup; the vector index is the rid.
    ///
    /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
    pub rules_by_rid: Vec<crate::license_detection::models::Rule>,

    /// Token ID sequences indexed by rule ID.
    ///
    /// Maps rule IDs (vector index) to their token ID sequences.
    ///
    /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
    pub tids_by_rid: Vec<Vec<TokenId>>,

    /// Aho-Corasick automaton built from all rule token sequences.
    ///
    /// Supports efficient multi-pattern matching of token ID sequences.
    /// Used for exact matching of complete rules or rule fragments in query text.
    ///
    /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
    pub rules_automaton: Automaton,

    /// Aho-Corasick automaton for unknown license detection.
    ///
    /// Separate automaton used to detect license-like text that doesn't match
    /// any known rule. Populated with ngrams from all approx-matchable rules.
    ///
    /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
    pub unknown_automaton: Automaton,

    /// Token ID sets per rule for candidate selection.
    ///
    /// Maps rule IDs to sets of unique token IDs present in that rule.
    /// Used for efficient candidate selection based on token overlap.
    ///
    /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
    pub sets_by_rid: HashMap<usize, HashSet<TokenId>>,

    /// Token ID multisets per rule for candidate ranking.
    ///
    /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
    /// Used for ranking candidates by token frequency overlap.
    ///
    /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
    pub msets_by_rid: HashMap<usize, HashMap<TokenId, usize>>,

    /// High-value token sets per rule for early candidate rejection.
    ///
    /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
    /// This is a subset of `sets_by_rid` for faster intersection computation
    /// and early rejection of candidates that won't pass the high-token threshold.
    ///
    /// Precomputed during index building to avoid redundant filtering at runtime.
    pub high_sets_by_rid: HashMap<usize, HashSet<TokenId>>,

    /// Inverted index of high-value token positions per rule.
    ///
    /// Maps rule IDs to a mapping from high-value token IDs to their positions
    /// within the rule. Only contains positions for tokens with IDs < len_legalese.
    ///
    /// This structure speeds up sequence matching by allowing quick lookup of
    /// where high-value tokens appear in each rule.
    ///
    /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
    /// In Python: `postings = {tid: array('h', [positions, ...])}`
    pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,

    /// Set of rule IDs for false positive rules.
    ///
    /// False positive rules are used for exact matching and post-matching
    /// filtering to subtract spurious matches.
    ///
    /// Corresponds to Python: `self.false_positive_rids = set()` (line 230)
    pub false_positive_rids: HashSet<usize>,

    /// Set of rule IDs that can be matched approximately.
    ///
    /// Only rules marked as approx-matchable participate in sequence matching.
    /// Other rules can only be matched exactly using the automaton.
    ///
    /// Note: This field is kept for Python parity documentation and test usage.
    /// The inverted index (`rids_by_high_tid`) now handles candidate filtering
    /// more efficiently, making direct iteration over this set unnecessary.
    ///
    /// Corresponds to Python: `self.approx_matchable_rids = set()` (line 234)
    #[allow(dead_code)]
    pub approx_matchable_rids: HashSet<usize>,

    /// Mapping from ScanCode license key to License object.
    ///
    /// Provides access to license metadata for building SPDX mappings
    /// and validating license expressions.
    ///
    /// Corresponds to Python: `get_licenses_db()` in models.py
    pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,

    /// Maps AhoCorasick pattern_id to rule id (rid).
    ///
    /// This is needed because the AhoCorasick pattern_id is just the index
    /// in the patterns iterator used to build the automaton, not the actual
    /// rule id. In Python, the automaton stores (rid, start, end) tuples as
    /// values, so the rid is retrieved from the stored value. In Rust, we
    /// maintain this mapping instead.
    ///
    /// Corresponds to Python: automaton values contain (rid, istart, iend)
    pub pattern_id_to_rid: Vec<usize>,

    /// Mapping from SPDX license key to rule ID.
    ///
    /// Enables direct lookup of rules by their SPDX license key,
    /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
    ///
    /// Keys are stored lowercase for case-insensitive lookup.
    ///
    /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
    pub rid_by_spdx_key: HashMap<String, usize>,

    /// Rule ID for the unknown-spdx license.
    ///
    /// Used as a fallback when an SPDX identifier is not recognized.
    /// `None` until populated by the index builder.
    ///
    /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
    pub unknown_spdx_rid: Option<usize>,

    /// Inverted index mapping high-value token IDs to rule IDs.
    ///
    /// This enables fast candidate selection by only examining rules
    /// that share at least one high-value (legalese) token with the query.
    /// Without this index, candidate selection would iterate over all 37,000+
    /// rules for every file, making license detection extremely slow.
    ///
    /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
    /// Rules not in approx_matchable_rids are excluded from this index.
    pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
}
207
208impl LicenseIndex {}
209
210impl LicenseIndex {
211    /// Create a new empty license index.
212    ///
213    /// This constructor initializes all index structures with empty collections.
214    /// The index can be populated with rules using the indexing methods (to be
215    /// implemented in future phases).
216    ///
217    /// # Returns
218    /// A new LicenseIndex instance with empty index structures
219    pub fn new(dictionary: TokenDictionary) -> Self {
220        let len_legalese = dictionary.legalese_count();
221        Self {
222            dictionary,
223            len_legalese,
224            rid_by_hash: HashMap::new(),
225            rules_by_rid: Vec::new(),
226            tids_by_rid: Vec::new(),
227            rules_automaton: Automaton::new(std::iter::empty::<&[u8]>())
228                .expect("Failed to create empty automaton"),
229            unknown_automaton: Automaton::new(std::iter::empty::<&[u8]>())
230                .expect("Failed to create empty automaton"),
231            sets_by_rid: HashMap::new(),
232            msets_by_rid: HashMap::new(),
233            high_sets_by_rid: HashMap::new(),
234            high_postings_by_rid: HashMap::new(),
235            false_positive_rids: HashSet::new(),
236            approx_matchable_rids: HashSet::new(),
237            licenses_by_key: HashMap::new(),
238            pattern_id_to_rid: Vec::new(),
239            rid_by_spdx_key: HashMap::new(),
240            unknown_spdx_rid: None,
241            rids_by_high_tid: HashMap::new(),
242        }
243    }
244
245    /// Create a new empty license index with the specified legalese count.
246    ///
247    /// Convenience method that creates a new TokenDictionary and LicenseIndex
248    /// in one call.
249    ///
250    /// # Arguments
251    /// * `legalese_count` - Number of reserved legalese token IDs
252    ///
253    /// # Returns
254    /// A new LicenseIndex instance with a new TokenDictionary
255    pub fn with_legalese_count(legalese_count: usize) -> Self {
256        Self::new(TokenDictionary::new(legalese_count))
257    }
258}
259
260impl Default for LicenseIndex {
261    fn default() -> Self {
262        Self::with_legalese_count(0)
263    }
264}
265
#[cfg(test)]
mod tests {
    use super::*;
    use crate::license_detection::models::License;

    /// Build a minimal `License` for tests.
    ///
    /// Only the fields that vary across tests are parameters; every other
    /// field defaults to empty/None. This replaces three near-identical
    /// 17-field struct literals that were duplicated across the tests below.
    fn make_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
        License {
            key: key.to_string(),
            name: name.to_string(),
            spdx_license_key: Some(spdx.to_string()),
            other_spdx_license_keys: vec![],
            category: Some(category.to_string()),
            text: text.to_string(),
            reference_urls: vec![],
            notes: None,
            is_deprecated: false,
            replaced_by: vec![],
            minimum_coverage: None,
            ignorable_copyrights: None,
            ignorable_holders: None,
            ignorable_authors: None,
            ignorable_urls: None,
            ignorable_emails: None,
        }
    }

    #[test]
    fn test_license_index_new() {
        let dict = TokenDictionary::new(10);
        let index = LicenseIndex::new(dict);

        assert_eq!(index.dictionary.legalese_count(), 10);
        assert!(index.rid_by_hash.is_empty());
        assert!(index.sets_by_rid.is_empty());
        assert!(index.msets_by_rid.is_empty());
        assert!(index.high_postings_by_rid.is_empty());
        assert!(index.false_positive_rids.is_empty());
        assert!(index.approx_matchable_rids.is_empty());
        assert!(index.licenses_by_key.is_empty());
    }

    #[test]
    fn test_license_index_with_legalese_count() {
        let index = LicenseIndex::with_legalese_count(15);

        assert_eq!(index.dictionary.legalese_count(), 15);
        assert!(index.rid_by_hash.is_empty());
    }

    #[test]
    fn test_license_index_default() {
        let index = LicenseIndex::default();

        assert_eq!(index.dictionary.legalese_count(), 0);
        assert!(index.rid_by_hash.is_empty());
    }

    #[test]
    fn test_automaton_default() {
        let automaton =
            Automaton::new(std::iter::empty::<&[u8]>()).expect("Failed to create automaton");
        // Confirm an empty automaton is constructible and Debug-formattable.
        let _ = format!("{:?}", automaton);
    }

    #[test]
    fn test_license_index_clone() {
        let index = LicenseIndex::with_legalese_count(5);
        let cloned = index.clone();

        assert_eq!(cloned.dictionary.legalese_count(), 5);
        assert!(cloned.rid_by_hash.is_empty());
    }

    #[test]
    fn test_license_index_add_license() {
        let mut index = LicenseIndex::default();

        let license = make_license(
            "test-license",
            "Test License",
            "TEST",
            "Permissive",
            "Test license text",
        );
        index.licenses_by_key.insert(license.key.clone(), license);

        assert_eq!(index.licenses_by_key.len(), 1);
        assert!(index.licenses_by_key.contains_key("test-license"));
    }

    #[test]
    fn test_license_index_add_licenses() {
        let mut index = LicenseIndex::default();

        let licenses = vec![
            make_license("license-1", "License 1", "LIC1", "Permissive", "License 1 text"),
            make_license("license-2", "License 2", "LIC2", "Copyleft", "License 2 text"),
        ];

        for license in licenses {
            index.licenses_by_key.insert(license.key.clone(), license);
        }

        assert_eq!(index.licenses_by_key.len(), 2);
        assert!(index.licenses_by_key.contains_key("license-1"));
        assert!(index.licenses_by_key.contains_key("license-2"));
    }

    #[test]
    fn test_license_index_get_license() {
        let mut index = LicenseIndex::default();

        let license = make_license("mit", "MIT License", "MIT", "Permissive", "MIT License text");
        index.licenses_by_key.insert(license.key.clone(), license);

        let retrieved = index.licenses_by_key.get("mit");
        assert!(retrieved.is_some());
        assert_eq!(retrieved.unwrap().name, "MIT License");

        assert!(!index.licenses_by_key.contains_key("unknown"));
    }

    #[test]
    fn test_license_index_license_count() {
        let mut index = LicenseIndex::default();

        assert_eq!(index.licenses_by_key.len(), 0);

        let license = make_license("test", "Test", "TEST", "Permissive", "Text");
        index.licenses_by_key.insert(license.key.clone(), license);

        assert_eq!(index.licenses_by_key.len(), 1);
    }
}
460}