Skip to main content

provenant/license_detection/index/
mod.rs

1//! License index construction and querying.
2
3pub mod builder;
4pub mod dictionary;
5pub mod token_sets;
6
7// build_index is used by library tests (see spdx_lid/test.rs, index/builder/tests.rs)
8// even though the binary doesn't use it directly.
9#[allow(unused_imports)]
10pub use builder::{
11    build_index, build_index_from_loaded, build_index_from_loaded_with_automatons,
12    loaded_license_to_license, loaded_rule_to_rule,
13};
14
15use crate::license_detection::TokenSet;
16use crate::license_detection::automaton::Automaton;
17use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
18use std::collections::{HashMap, HashSet};
19
20#[derive(Debug, Clone, Default, PartialEq, Eq)]
21pub struct IndexedRuleMetadata {
22    pub license_expression_spdx: Option<String>,
23    pub skip_for_required_phrase_generation: bool,
24    pub replaced_by: Vec<String>,
25}
26
27/// License index containing all data structures for efficient license detection.
28///
29/// The LicenseIndex holds multiple index structures that enable different matching
30/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
31/// candidate selection, and sequence matching.
32///
33/// Based on the Python ScanCode Toolkit implementation at:
34/// reference/scancode-toolkit/src/licensedcode/index.py
35///
36/// # Index Structures
37///
38/// The index maintains several data structures for different matching strategies:
39///
40/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
41/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
42/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
43/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
44/// - **Rule classification**: `false_positive_rids`, `approx_matchable_rids`
45#[derive(Debug, Clone)]
46pub struct LicenseIndex {
47    /// Token dictionary mapping token strings to integer IDs.
48    ///
49    /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
50    /// IDs len_legalese and above are assigned to other tokens as encountered.
51    pub dictionary: TokenDictionary,
52
53    /// Number of legalese tokens.
54    ///
55    /// Tokens with ID < len_legalese are considered high-value legalese words.
56    /// Tokens with ID >= len_legalese are considered low-value tokens.
57    ///
58    /// Corresponds to Python: `self.len_legalese = 0` (line 185)
59    pub len_legalese: usize,
60
61    /// Mapping from rule hash to rule ID for hash-based exact matching.
62    ///
63    /// This enables fast exact matches using a hash of the rule\'s token IDs.
64    /// Each hash maps to exactly one rule ID.
65    ///
66    /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
67    /// In practice, we use a HashMap<[u8; 20], usize>.
68    ///
69    /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
70    pub rid_by_hash: HashMap<[u8; 20], usize>,
71
72    /// Rules indexed by rule ID.
73    ///
74    /// Maps rule IDs to Rule objects for quick lookup.
75    ///
76    /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
77    pub rules_by_rid: Vec<crate::license_detection::models::Rule>,
78
79    /// Token ID sequences indexed by rule ID.
80    ///
81    /// Maps rule IDs to their token ID sequences.
82    ///
83    /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
84    pub tids_by_rid: Vec<Vec<TokenId>>,
85
86    /// Aho-Corasick automaton built from all rule token sequences.
87    ///
88    /// Supports efficient multi-pattern matching of token ID sequences.
89    /// Used for exact matching of complete rules or rule fragments in query text.
90    ///
91    /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
92    pub rules_automaton: Automaton,
93
94    /// Aho-Corasick automaton for unknown license detection.
95    ///
96    /// Separate automaton used to detect license-like text that doesn\'t match
97    /// any known rule. Populated with ngrams from all approx-matchable rules.
98    ///
99    /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
100    pub unknown_automaton: Automaton,
101
102    /// Token ID sets per rule for candidate selection.
103    ///
104    /// Maps rule IDs to sets of unique token IDs present in that rule.
105    /// Used for efficient candidate selection based on token overlap.
106    ///
107    /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
108    pub sets_by_rid: HashMap<usize, TokenSet>,
109
110    pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
111
112    /// Token ID multisets per rule for candidate ranking.
113    ///
114    /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
115    /// Used for ranking candidates by token frequency overlap.
116    ///
117    /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
118    pub msets_by_rid: HashMap<usize, HashMap<TokenId, usize>>,
119
120    /// High-value token sets per rule for early candidate rejection.
121    ///
122    /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
123    /// This is a subset of `sets_by_rid` for faster intersection computation
124    /// and early rejection of candidates that won't pass the high-token threshold.
125    ///
126    /// Precomputed during index building to avoid redundant filtering at runtime.
127    pub high_sets_by_rid: HashMap<usize, TokenSet>,
128
129    /// Inverted index of high-value token positions per rule.
130    ///
131    /// Maps rule IDs to a mapping from high-value token IDs to their positions
132    /// within the rule. Only contains positions for tokens with IDs < len_legalese.
133    ///
134    /// This structure speeds up sequence matching by allowing quick lookup of
135    /// where high-value tokens appear in each rule.
136    ///
137    /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
138    /// In Python: `postings = {tid: array('h', [positions, ...])}`
139    pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,
140
141    /// Set of rule IDs for false positive rules.
142    ///
143    /// False positive rules are used for exact matching and post-matching
144    /// filtering to subtract spurious matches.
145    ///
146    /// Corresponds to Python: `self.false_positive_rids = set()` (line 230)
147    pub false_positive_rids: HashSet<usize>,
148
149    /// Set of rule IDs that can be matched approximately.
150    ///
151    /// Only rules marked as approx-matchable participate in sequence matching.
152    /// Other rules can only be matched exactly using the automaton.
153    ///
154    /// Note: This field is kept for Python parity documentation and test usage.
155    /// The inverted index (`rids_by_high_tid`) now handles candidate filtering
156    /// more efficiently, making direct iteration over this set unnecessary.
157    ///
158    /// Corresponds to Python: `self.approx_matchable_rids = set()` (line 234)
159    #[allow(dead_code)]
160    pub approx_matchable_rids: HashSet<usize>,
161
162    /// Mapping from ScanCode license key to License object.
163    ///
164    /// Provides access to license metadata for building SPDX mappings
165    /// and validating license expressions.
166    ///
167    /// Corresponds to Python: `get_licenses_db()` in models.py
168    pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,
169
170    /// Maps AhoCorasick pattern_id to rule ids (rids).
171    ///
172    /// This is needed because the AhoCorasick pattern_id is just the index
173    /// in the patterns iterator used to build the automaton, not the actual
174    /// rule id. In Python, the automaton stores (rid, start, end) tuples as
175    /// values, so the rid is retrieved from the stored value. In Rust, we
176    /// maintain this mapping instead.
177    ///
178    /// Multiple rules can share the same token pattern (e.g., rules that differ
179    /// only in license_expression). Each pattern_id maps to a list of all rule IDs
180    /// that share that pattern.
181    ///
182    /// Corresponds to Python: automaton values contain (rid, istart, iend)
183    pub pattern_id_to_rid: Vec<Vec<usize>>,
184
185    /// Mapping from SPDX license key to rule ID.
186    ///
187    /// Enables direct lookup of rules by their SPDX license key,
188    /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
189    ///
190    /// Keys are stored lowercase for case-insensitive lookup.
191    ///
192    /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
193    pub rid_by_spdx_key: HashMap<String, usize>,
194
195    /// Rule ID for the unknown-spdx license.
196    ///
197    /// Used as a fallback when an SPDX identifier is not recognized.
198    ///
199    /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
200    pub unknown_spdx_rid: Option<usize>,
201
202    /// Inverted index mapping high-value token IDs to rule IDs.
203    ///
204    /// This enables fast candidate selection by only examining rules
205    /// that share at least one high-value (legalese) token with the query.
206    /// Without this index, candidate selection would iterate over all 37,000+
207    /// rules for every file, making license detection extremely slow.
208    ///
209    /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
210    /// Rules not in approx_matchable_rids are excluded from this index.
211    pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
212}
213
214impl LicenseIndex {}
215
216impl LicenseIndex {
217    /// Create a new empty license index.
218    ///
219    /// This constructor initializes all index structures with empty collections.
220    /// The index can be populated with rules using the indexing methods (to be
221    /// implemented in future phases).
222    ///
223    /// # Returns
224    /// A new LicenseIndex instance with empty index structures
225    pub fn new(dictionary: TokenDictionary) -> Self {
226        use crate::license_detection::automaton::AutomatonBuilder;
227
228        let len_legalese = dictionary.legalese_count();
229        Self {
230            dictionary,
231            len_legalese,
232            rid_by_hash: HashMap::new(),
233            rules_by_rid: Vec::new(),
234            tids_by_rid: Vec::new(),
235            rules_automaton: AutomatonBuilder::new().build(),
236            unknown_automaton: AutomatonBuilder::new().build(),
237            sets_by_rid: HashMap::new(),
238            rule_metadata_by_identifier: HashMap::new(),
239            msets_by_rid: HashMap::new(),
240            high_sets_by_rid: HashMap::new(),
241            high_postings_by_rid: HashMap::new(),
242            false_positive_rids: HashSet::new(),
243            approx_matchable_rids: HashSet::new(),
244            licenses_by_key: HashMap::new(),
245            pattern_id_to_rid: Vec::new(),
246            rid_by_spdx_key: HashMap::new(),
247            unknown_spdx_rid: None,
248            rids_by_high_tid: HashMap::new(),
249        }
250    }
251
252    /// Create a new empty license index with the specified legalese count.
253    ///
254    /// Convenience method that creates a new TokenDictionary and LicenseIndex
255    /// in one call.
256    ///
257    /// # Arguments
258    /// * `legalese_count` - Number of reserved legalese token IDs
259    ///
260    /// # Returns
261    /// A new LicenseIndex instance with a new TokenDictionary
262    pub fn with_legalese_count(legalese_count: usize) -> Self {
263        Self::new(TokenDictionary::new(legalese_count))
264    }
265}
266
267impl Default for LicenseIndex {
268    fn default() -> Self {
269        Self::with_legalese_count(0)
270    }
271}
272
273#[cfg(test)]
274mod tests {
275    use super::*;
276
277    fn simple_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
278        License {
279            key: key.to_string(),
280            short_name: Some(name.to_string()),
281            name: name.to_string(),
282            language: Some("en".to_string()),
283            spdx_license_key: Some(spdx.to_string()),
284            other_spdx_license_keys: vec![],
285            category: Some(category.to_string()),
286            owner: None,
287            homepage_url: None,
288            text: text.to_string(),
289            reference_urls: vec![],
290            osi_license_key: Some(spdx.to_string()),
291            text_urls: vec![],
292            osi_url: None,
293            faq_url: None,
294            other_urls: vec![],
295            notes: None,
296            is_deprecated: false,
297            is_exception: false,
298            is_unknown: false,
299            is_generic: false,
300            replaced_by: vec![],
301            minimum_coverage: None,
302            standard_notice: None,
303            ignorable_copyrights: None,
304            ignorable_holders: None,
305            ignorable_authors: None,
306            ignorable_urls: None,
307            ignorable_emails: None,
308        }
309    }
310    use crate::license_detection::models::License;
311
312    #[test]
313    fn test_license_index_new() {
314        let dict = TokenDictionary::new(10);
315        let index = LicenseIndex::new(dict);
316
317        assert_eq!(index.dictionary.legalese_count(), 10);
318        assert!(index.rid_by_hash.is_empty());
319        assert!(index.sets_by_rid.is_empty());
320        assert!(index.msets_by_rid.is_empty());
321        assert!(index.high_postings_by_rid.is_empty());
322        assert!(index.false_positive_rids.is_empty());
323        assert!(index.approx_matchable_rids.is_empty());
324        assert!(index.licenses_by_key.is_empty());
325    }
326
327    #[test]
328    fn test_license_index_with_legalese_count() {
329        let index = LicenseIndex::with_legalese_count(15);
330
331        assert_eq!(index.dictionary.legalese_count(), 15);
332        assert!(index.rid_by_hash.is_empty());
333    }
334
335    #[test]
336    fn test_license_index_default() {
337        let index = LicenseIndex::default();
338
339        assert_eq!(index.dictionary.legalese_count(), 0);
340        assert!(index.rid_by_hash.is_empty());
341    }
342
343    #[test]
344    fn test_automaton_default() {
345        use crate::license_detection::automaton::AutomatonBuilder;
346
347        let automaton = AutomatonBuilder::new().build();
348        let _ = format!("{:?}", automaton);
349    }
350
351    #[test]
352    fn test_license_index_clone() {
353        let index = LicenseIndex::with_legalese_count(5);
354        let cloned = index.clone();
355
356        assert_eq!(cloned.dictionary.legalese_count(), 5);
357        assert!(cloned.rid_by_hash.is_empty());
358    }
359
360    #[test]
361    fn test_license_index_add_license() {
362        let mut index = LicenseIndex::default();
363
364        let license = simple_license(
365            "test-license",
366            "Test License",
367            "TEST",
368            "Permissive",
369            "Test license text",
370        );
371
372        index.licenses_by_key.insert(license.key.clone(), license);
373
374        assert_eq!(index.licenses_by_key.len(), 1);
375        assert!(index.licenses_by_key.contains_key("test-license"));
376    }
377
378    #[test]
379    fn test_license_index_add_licenses() {
380        let mut index = LicenseIndex::default();
381
382        let licenses = vec![
383            simple_license(
384                "license-1",
385                "License 1",
386                "LIC1",
387                "Permissive",
388                "License 1 text",
389            ),
390            simple_license(
391                "license-2",
392                "License 2",
393                "LIC2",
394                "Copyleft",
395                "License 2 text",
396            ),
397        ];
398
399        for license in licenses {
400            index.licenses_by_key.insert(license.key.clone(), license);
401        }
402
403        assert_eq!(index.licenses_by_key.len(), 2);
404        assert!(index.licenses_by_key.contains_key("license-1"));
405        assert!(index.licenses_by_key.contains_key("license-2"));
406    }
407
408    #[test]
409    fn test_license_index_get_license() {
410        let mut index = LicenseIndex::default();
411
412        let license = simple_license(
413            "mit",
414            "MIT License",
415            "MIT",
416            "Permissive",
417            "MIT License text",
418        );
419
420        index.licenses_by_key.insert(license.key.clone(), license);
421
422        let retrieved = index.licenses_by_key.get("mit");
423        assert!(retrieved.is_some());
424        assert_eq!(retrieved.unwrap().name, "MIT License");
425
426        assert!(!index.licenses_by_key.contains_key("unknown"));
427    }
428
429    #[test]
430    fn test_license_index_license_count() {
431        let mut index = LicenseIndex::default();
432
433        assert_eq!(index.licenses_by_key.len(), 0);
434
435        let license = simple_license("test", "Test", "TEST", "Permissive", "Text");
436
437        index.licenses_by_key.insert(license.key.clone(), license);
438
439        assert_eq!(index.licenses_by_key.len(), 1);
440    }
441}