Skip to main content

provenant/license_detection/index/
mod.rs

1//! License index construction and querying.
2
3pub mod builder;
4pub mod dictionary;
5
6// build_index is used by library tests (see spdx_lid/test.rs, index/builder/tests.rs)
7// even though the binary doesn't use it directly.
8#[allow(unused_imports)]
9pub use builder::{
10    build_index, build_index_from_loaded, build_index_from_loaded_with_automatons,
11    loaded_license_to_license, loaded_rule_to_rule,
12};
13
14use crate::license_detection::automaton::Automaton;
15use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
16use crate::license_detection::{TokenMultiset, TokenSet};
17use std::collections::{HashMap, HashSet};
18
19#[derive(Debug, Clone, Default, PartialEq, Eq)]
20pub struct IndexedRuleMetadata {
21    pub license_expression_spdx: Option<String>,
22    pub skip_for_required_phrase_generation: bool,
23    pub replaced_by: Vec<String>,
24}
25
26/// License index containing all data structures for efficient license detection.
27///
28/// The LicenseIndex holds multiple index structures that enable different matching
29/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
30/// candidate selection, and sequence matching.
31///
32/// Based on the Python ScanCode Toolkit implementation at:
33/// reference/scancode-toolkit/src/licensedcode/index.py
34///
35/// # Index Structures
36///
37/// The index maintains several data structures for different matching strategies:
38///
39/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
40/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
41/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
42/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
43/// - **Rule classification**: `false_positive_rids`, `approx_matchable_rids`
44#[derive(Debug, Clone)]
45pub struct LicenseIndex {
46    /// Token dictionary mapping token strings to integer IDs.
47    ///
48    /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
49    /// IDs len_legalese and above are assigned to other tokens as encountered.
50    pub dictionary: TokenDictionary,
51
52    /// Number of legalese tokens.
53    ///
54    /// Tokens with ID < len_legalese are considered high-value legalese words.
55    /// Tokens with ID >= len_legalese are considered low-value tokens.
56    ///
57    /// Corresponds to Python: `self.len_legalese = 0` (line 185)
58    pub len_legalese: usize,
59
60    /// Mapping from rule hash to rule ID for hash-based exact matching.
61    ///
62    /// This enables fast exact matches using a hash of the rule\'s token IDs.
63    /// Each hash maps to exactly one rule ID.
64    ///
65    /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
66    /// In practice, we use a HashMap<[u8; 20], usize>.
67    ///
68    /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
69    pub rid_by_hash: HashMap<[u8; 20], usize>,
70
71    /// Rules indexed by rule ID.
72    ///
73    /// Maps rule IDs to Rule objects for quick lookup.
74    ///
75    /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
76    pub rules_by_rid: Vec<crate::license_detection::models::Rule>,
77
78    /// Token ID sequences indexed by rule ID.
79    ///
80    /// Maps rule IDs to their token ID sequences.
81    ///
82    /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
83    pub tids_by_rid: Vec<Vec<TokenId>>,
84
85    /// Aho-Corasick automaton built from all rule token sequences.
86    ///
87    /// Supports efficient multi-pattern matching of token ID sequences.
88    /// Used for exact matching of complete rules or rule fragments in query text.
89    ///
90    /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
91    pub rules_automaton: Automaton,
92
93    /// Aho-Corasick automaton for unknown license detection.
94    ///
95    /// Separate automaton used to detect license-like text that doesn\'t match
96    /// any known rule. Populated with ngrams from all approx-matchable rules.
97    ///
98    /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
99    pub unknown_automaton: Automaton,
100
101    /// Token ID sets per rule for candidate selection.
102    ///
103    /// Maps rule IDs to sets of unique token IDs present in that rule.
104    /// Used for efficient candidate selection based on token overlap.
105    ///
106    /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
107    pub sets_by_rid: HashMap<usize, TokenSet>,
108
109    pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
110
111    /// Token ID multisets per rule for candidate ranking.
112    ///
113    /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
114    /// Used for ranking candidates by token frequency overlap.
115    ///
116    /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
117    pub msets_by_rid: HashMap<usize, TokenMultiset>,
118
119    /// High-value token sets per rule for early candidate rejection.
120    ///
121    /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
122    /// This is a subset of `sets_by_rid` for faster intersection computation
123    /// and early rejection of candidates that won't pass the high-token threshold.
124    ///
125    /// Precomputed during index building to avoid redundant filtering at runtime.
126    pub high_sets_by_rid: HashMap<usize, TokenSet>,
127
128    /// Inverted index of high-value token positions per rule.
129    ///
130    /// Maps rule IDs to a mapping from high-value token IDs to their positions
131    /// within the rule. Only contains positions for tokens with IDs < len_legalese.
132    ///
133    /// This structure speeds up sequence matching by allowing quick lookup of
134    /// where high-value tokens appear in each rule.
135    ///
136    /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
137    /// In Python: `postings = {tid: array('h', [positions, ...])}`
138    pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,
139
140    /// Set of rule IDs for false positive rules.
141    ///
142    /// False positive rules are used for exact matching and post-matching
143    /// filtering to subtract spurious matches.
144    ///
145    /// Corresponds to Python: `self.false_positive_rids = set()` (line 230)
146    pub false_positive_rids: HashSet<usize>,
147
148    /// Set of rule IDs that can be matched approximately.
149    ///
150    /// Only rules marked as approx-matchable participate in sequence matching.
151    /// Other rules can only be matched exactly using the automaton.
152    ///
153    /// Note: This field is kept for Python parity documentation and test usage.
154    /// The inverted index (`rids_by_high_tid`) now handles candidate filtering
155    /// more efficiently, making direct iteration over this set unnecessary.
156    ///
157    /// Corresponds to Python: `self.approx_matchable_rids = set()` (line 234)
158    #[allow(dead_code)]
159    pub approx_matchable_rids: HashSet<usize>,
160
161    /// Mapping from ScanCode license key to License object.
162    ///
163    /// Provides access to license metadata for building SPDX mappings
164    /// and validating license expressions.
165    ///
166    /// Corresponds to Python: `get_licenses_db()` in models.py
167    pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,
168
169    /// Maps AhoCorasick pattern_id to rule ids (rids).
170    ///
171    /// This is needed because the AhoCorasick pattern_id is just the index
172    /// in the patterns iterator used to build the automaton, not the actual
173    /// rule id. In Python, the automaton stores (rid, start, end) tuples as
174    /// values, so the rid is retrieved from the stored value. In Rust, we
175    /// maintain this mapping instead.
176    ///
177    /// Multiple rules can share the same token pattern (e.g., rules that differ
178    /// only in license_expression). Each pattern_id maps to a list of all rule IDs
179    /// that share that pattern.
180    ///
181    /// Corresponds to Python: automaton values contain (rid, istart, iend)
182    pub pattern_id_to_rid: Vec<Vec<usize>>,
183
184    /// Mapping from SPDX license key to rule ID.
185    ///
186    /// Enables direct lookup of rules by their SPDX license key,
187    /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
188    ///
189    /// Keys are stored lowercase for case-insensitive lookup.
190    ///
191    /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
192    pub rid_by_spdx_key: HashMap<String, usize>,
193
194    /// Rule ID for the unknown-spdx license.
195    ///
196    /// Used as a fallback when an SPDX identifier is not recognized.
197    ///
198    /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
199    pub unknown_spdx_rid: Option<usize>,
200
201    /// Inverted index mapping high-value token IDs to rule IDs.
202    ///
203    /// This enables fast candidate selection by only examining rules
204    /// that share at least one high-value (legalese) token with the query.
205    /// Without this index, candidate selection would iterate over all 37,000+
206    /// rules for every file, making license detection extremely slow.
207    ///
208    /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
209    /// Rules not in approx_matchable_rids are excluded from this index.
210    pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
211}
212
213impl LicenseIndex {}
214
215impl LicenseIndex {
216    /// Create a new empty license index.
217    ///
218    /// This constructor initializes all index structures with empty collections.
219    /// The index can be populated with rules using the indexing methods (to be
220    /// implemented in future phases).
221    ///
222    /// # Returns
223    /// A new LicenseIndex instance with empty index structures
224    pub fn new(dictionary: TokenDictionary) -> Self {
225        use crate::license_detection::automaton::AutomatonBuilder;
226
227        let len_legalese = dictionary.legalese_count();
228        Self {
229            dictionary,
230            len_legalese,
231            rid_by_hash: HashMap::new(),
232            rules_by_rid: Vec::new(),
233            tids_by_rid: Vec::new(),
234            rules_automaton: AutomatonBuilder::new().build(),
235            unknown_automaton: AutomatonBuilder::new().build(),
236            sets_by_rid: HashMap::new(),
237            rule_metadata_by_identifier: HashMap::new(),
238            msets_by_rid: HashMap::new(),
239            high_sets_by_rid: HashMap::new(),
240            high_postings_by_rid: HashMap::new(),
241            false_positive_rids: HashSet::new(),
242            approx_matchable_rids: HashSet::new(),
243            licenses_by_key: HashMap::new(),
244            pattern_id_to_rid: Vec::new(),
245            rid_by_spdx_key: HashMap::new(),
246            unknown_spdx_rid: None,
247            rids_by_high_tid: HashMap::new(),
248        }
249    }
250
251    /// Create a new empty license index with the specified legalese count.
252    ///
253    /// Convenience method that creates a new TokenDictionary and LicenseIndex
254    /// in one call.
255    ///
256    /// # Arguments
257    /// * `legalese_count` - Number of reserved legalese token IDs
258    ///
259    /// # Returns
260    /// A new LicenseIndex instance with a new TokenDictionary
261    pub fn with_legalese_count(legalese_count: usize) -> Self {
262        Self::new(TokenDictionary::new(legalese_count))
263    }
264}
265
266impl Default for LicenseIndex {
267    fn default() -> Self {
268        Self::with_legalese_count(0)
269    }
270}
271
272#[cfg(test)]
273mod tests {
274    use super::*;
275
276    fn simple_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
277        License {
278            key: key.to_string(),
279            short_name: Some(name.to_string()),
280            name: name.to_string(),
281            language: Some("en".to_string()),
282            spdx_license_key: Some(spdx.to_string()),
283            other_spdx_license_keys: vec![],
284            category: Some(category.to_string()),
285            owner: None,
286            homepage_url: None,
287            text: text.to_string(),
288            reference_urls: vec![],
289            osi_license_key: Some(spdx.to_string()),
290            text_urls: vec![],
291            osi_url: None,
292            faq_url: None,
293            other_urls: vec![],
294            notes: None,
295            is_deprecated: false,
296            is_exception: false,
297            is_unknown: false,
298            is_generic: false,
299            replaced_by: vec![],
300            minimum_coverage: None,
301            standard_notice: None,
302            ignorable_copyrights: None,
303            ignorable_holders: None,
304            ignorable_authors: None,
305            ignorable_urls: None,
306            ignorable_emails: None,
307        }
308    }
309    use crate::license_detection::models::License;
310
311    #[test]
312    fn test_license_index_new() {
313        let dict = TokenDictionary::new(10);
314        let index = LicenseIndex::new(dict);
315
316        assert_eq!(index.dictionary.legalese_count(), 10);
317        assert!(index.rid_by_hash.is_empty());
318        assert!(index.sets_by_rid.is_empty());
319        assert!(index.msets_by_rid.is_empty());
320        assert!(index.high_postings_by_rid.is_empty());
321        assert!(index.false_positive_rids.is_empty());
322        assert!(index.approx_matchable_rids.is_empty());
323        assert!(index.licenses_by_key.is_empty());
324    }
325
326    #[test]
327    fn test_license_index_with_legalese_count() {
328        let index = LicenseIndex::with_legalese_count(15);
329
330        assert_eq!(index.dictionary.legalese_count(), 15);
331        assert!(index.rid_by_hash.is_empty());
332    }
333
334    #[test]
335    fn test_license_index_default() {
336        let index = LicenseIndex::default();
337
338        assert_eq!(index.dictionary.legalese_count(), 0);
339        assert!(index.rid_by_hash.is_empty());
340    }
341
342    #[test]
343    fn test_automaton_default() {
344        use crate::license_detection::automaton::AutomatonBuilder;
345
346        let automaton = AutomatonBuilder::new().build();
347        let _ = format!("{:?}", automaton);
348    }
349
350    #[test]
351    fn test_license_index_clone() {
352        let index = LicenseIndex::with_legalese_count(5);
353        let cloned = index.clone();
354
355        assert_eq!(cloned.dictionary.legalese_count(), 5);
356        assert!(cloned.rid_by_hash.is_empty());
357    }
358
359    #[test]
360    fn test_license_index_add_license() {
361        let mut index = LicenseIndex::default();
362
363        let license = simple_license(
364            "test-license",
365            "Test License",
366            "TEST",
367            "Permissive",
368            "Test license text",
369        );
370
371        index.licenses_by_key.insert(license.key.clone(), license);
372
373        assert_eq!(index.licenses_by_key.len(), 1);
374        assert!(index.licenses_by_key.contains_key("test-license"));
375    }
376
377    #[test]
378    fn test_license_index_add_licenses() {
379        let mut index = LicenseIndex::default();
380
381        let licenses = vec![
382            simple_license(
383                "license-1",
384                "License 1",
385                "LIC1",
386                "Permissive",
387                "License 1 text",
388            ),
389            simple_license(
390                "license-2",
391                "License 2",
392                "LIC2",
393                "Copyleft",
394                "License 2 text",
395            ),
396        ];
397
398        for license in licenses {
399            index.licenses_by_key.insert(license.key.clone(), license);
400        }
401
402        assert_eq!(index.licenses_by_key.len(), 2);
403        assert!(index.licenses_by_key.contains_key("license-1"));
404        assert!(index.licenses_by_key.contains_key("license-2"));
405    }
406
407    #[test]
408    fn test_license_index_get_license() {
409        let mut index = LicenseIndex::default();
410
411        let license = simple_license(
412            "mit",
413            "MIT License",
414            "MIT",
415            "Permissive",
416            "MIT License text",
417        );
418
419        index.licenses_by_key.insert(license.key.clone(), license);
420
421        let retrieved = index.licenses_by_key.get("mit");
422        assert!(retrieved.is_some());
423        assert_eq!(retrieved.unwrap().name, "MIT License");
424
425        assert!(!index.licenses_by_key.contains_key("unknown"));
426    }
427
428    #[test]
429    fn test_license_index_license_count() {
430        let mut index = LicenseIndex::default();
431
432        assert_eq!(index.licenses_by_key.len(), 0);
433
434        let license = simple_license("test", "Test", "TEST", "Permissive", "Text");
435
436        index.licenses_by_key.insert(license.key.clone(), license);
437
438        assert_eq!(index.licenses_by_key.len(), 1);
439    }
440}