Skip to main content

provenant/license_detection/index/
mod.rs

1//! License index construction and querying.
2
3pub mod builder;
4pub mod dictionary;
5
6// build_index is used by library tests (see spdx_lid/test.rs, index/builder/tests.rs)
7// even though the binary doesn't use it directly.
8#[allow(unused_imports)]
9pub use builder::{
10    build_index, build_index_from_loaded, build_index_from_loaded_with_automatons,
11    loaded_license_to_license, loaded_rule_to_rule,
12};
13
14use crate::license_detection::automaton::Automaton;
15use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
16use crate::license_detection::{TokenMultiset, TokenSet};
17use rkyv::Archive;
18use std::collections::{HashMap, HashSet};
19
20#[derive(Debug, Clone, Default, PartialEq, Eq, Archive, rkyv::Serialize, rkyv::Deserialize)]
21pub struct IndexedRuleMetadata {
22    pub license_expression_spdx: Option<String>,
23    pub skip_for_required_phrase_generation: bool,
24    pub replaced_by: Vec<String>,
25}
26
27/// Rkyv-compatible cache representation of LicenseIndex.
28///
29/// Automaton fields and complex model types are stored as serialized byte
30/// vectors since they have custom serde logic that isn't compatible with
31/// rkyv's derive macros.
32#[derive(Debug, Archive, rkyv::Serialize, rkyv::Deserialize)]
33pub struct CachedLicenseIndex {
34    pub dictionary: TokenDictionary,
35    pub len_legalese: usize,
36    pub rid_by_hash: HashMap<[u8; 20], usize>,
37    pub rules_by_rid_bytes: Vec<u8>,
38    pub tids_by_rid: Vec<Vec<TokenId>>,
39    pub rules_automaton_bytes: Vec<u8>,
40    pub unknown_automaton_bytes: Vec<u8>,
41    pub sets_by_rid: HashMap<usize, TokenSet>,
42    pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
43    pub msets_by_rid: HashMap<usize, TokenMultiset>,
44    pub high_sets_by_rid: HashMap<usize, TokenSet>,
45    pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,
46    pub false_positive_rids: HashSet<usize>,
47    pub approx_matchable_rids: HashSet<usize>,
48    pub licenses_by_key_bytes: Vec<u8>,
49    pub pattern_id_to_rid: Vec<Vec<usize>>,
50    pub rid_by_spdx_key: HashMap<String, usize>,
51    pub unknown_spdx_rid: Option<usize>,
52    pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
53    pub spdx_license_list_version: Option<String>,
54}
55
56impl From<LicenseIndex> for CachedLicenseIndex {
57    fn from(index: LicenseIndex) -> Self {
58        Self {
59            dictionary: index.dictionary,
60            len_legalese: index.len_legalese,
61            rid_by_hash: index.rid_by_hash,
62            rules_by_rid_bytes: rmp_serde::to_vec(&index.rules_by_rid).unwrap(),
63            tids_by_rid: index.tids_by_rid,
64            rules_automaton_bytes: index.rules_automaton.serialize_bytes(),
65            unknown_automaton_bytes: index.unknown_automaton.serialize_bytes(),
66            sets_by_rid: index.sets_by_rid,
67            rule_metadata_by_identifier: index.rule_metadata_by_identifier,
68            msets_by_rid: index.msets_by_rid,
69            high_sets_by_rid: index.high_sets_by_rid,
70            high_postings_by_rid: index.high_postings_by_rid,
71            false_positive_rids: index.false_positive_rids,
72            approx_matchable_rids: index.approx_matchable_rids,
73            licenses_by_key_bytes: rmp_serde::to_vec(&index.licenses_by_key).unwrap(),
74            pattern_id_to_rid: index.pattern_id_to_rid,
75            rid_by_spdx_key: index.rid_by_spdx_key,
76            unknown_spdx_rid: index.unknown_spdx_rid,
77            rids_by_high_tid: index.rids_by_high_tid,
78            spdx_license_list_version: None,
79        }
80    }
81}
82
83impl From<CachedLicenseIndex> for LicenseIndex {
84    fn from(cached: CachedLicenseIndex) -> Self {
85        Self {
86            dictionary: cached.dictionary,
87            len_legalese: cached.len_legalese,
88            rid_by_hash: cached.rid_by_hash,
89            rules_by_rid: rmp_serde::from_slice(&cached.rules_by_rid_bytes).unwrap(),
90            tids_by_rid: cached.tids_by_rid,
91            rules_automaton: Automaton::deserialize_unchecked(&cached.rules_automaton_bytes),
92            unknown_automaton: Automaton::deserialize_unchecked(&cached.unknown_automaton_bytes),
93            sets_by_rid: cached.sets_by_rid,
94            rule_metadata_by_identifier: cached.rule_metadata_by_identifier,
95            msets_by_rid: cached.msets_by_rid,
96            high_sets_by_rid: cached.high_sets_by_rid,
97            high_postings_by_rid: cached.high_postings_by_rid,
98            false_positive_rids: cached.false_positive_rids,
99            approx_matchable_rids: cached.approx_matchable_rids,
100            licenses_by_key: rmp_serde::from_slice(&cached.licenses_by_key_bytes).unwrap(),
101            pattern_id_to_rid: cached.pattern_id_to_rid,
102            rid_by_spdx_key: cached.rid_by_spdx_key,
103            unknown_spdx_rid: cached.unknown_spdx_rid,
104            rids_by_high_tid: cached.rids_by_high_tid,
105        }
106    }
107}
108
109/// License index containing all data structures for efficient license detection.
110///
111/// The LicenseIndex holds multiple index structures that enable different matching
112/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
113/// candidate selection, and sequence matching.
114///
115/// Based on the Python ScanCode Toolkit implementation at:
116/// reference/scancode-toolkit/src/licensedcode/index.py
117///
118/// # Index Structures
119///
120/// The index maintains several data structures for different matching strategies:
121///
122/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
123/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
124/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
125/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
126/// - **Rule classification**: `false_positive_rids`, `approx_matchable_rids`
127#[derive(Debug, Clone)]
128pub struct LicenseIndex {
129    /// Token dictionary mapping token strings to integer IDs.
130    ///
131    /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
132    /// IDs len_legalese and above are assigned to other tokens as encountered.
133    pub dictionary: TokenDictionary,
134
135    /// Number of legalese tokens.
136    ///
137    /// Tokens with ID < len_legalese are considered high-value legalese words.
138    /// Tokens with ID >= len_legalese are considered low-value tokens.
139    ///
140    /// Corresponds to Python: `self.len_legalese = 0` (line 185)
141    pub len_legalese: usize,
142
143    /// Mapping from rule hash to rule ID for hash-based exact matching.
144    ///
145    /// This enables fast exact matches using a hash of the rule\'s token IDs.
146    /// Each hash maps to exactly one rule ID.
147    ///
148    /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
149    /// In practice, we use a HashMap<[u8; 20], usize>.
150    ///
151    /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
152    pub rid_by_hash: HashMap<[u8; 20], usize>,
153
154    /// Rules indexed by rule ID.
155    ///
156    /// Maps rule IDs to Rule objects for quick lookup.
157    ///
158    /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
159    pub rules_by_rid: Vec<crate::license_detection::models::Rule>,
160
161    /// Token ID sequences indexed by rule ID.
162    ///
163    /// Maps rule IDs to their token ID sequences.
164    ///
165    /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
166    pub tids_by_rid: Vec<Vec<TokenId>>,
167
168    /// Aho-Corasick automaton built from all rule token sequences.
169    ///
170    /// Supports efficient multi-pattern matching of token ID sequences.
171    /// Used for exact matching of complete rules or rule fragments in query text.
172    ///
173    /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
174    pub rules_automaton: Automaton,
175
176    /// Aho-Corasick automaton for unknown license detection.
177    ///
178    /// Separate automaton used to detect license-like text that doesn\'t match
179    /// any known rule. Populated with ngrams from all approx-matchable rules.
180    ///
181    /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
182    pub unknown_automaton: Automaton,
183
184    /// Token ID sets per rule for candidate selection.
185    ///
186    /// Maps rule IDs to sets of unique token IDs present in that rule.
187    /// Used for efficient candidate selection based on token overlap.
188    ///
189    /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
190    pub sets_by_rid: HashMap<usize, TokenSet>,
191
192    pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
193
194    /// Token ID multisets per rule for candidate ranking.
195    ///
196    /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
197    /// Used for ranking candidates by token frequency overlap.
198    ///
199    /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
200    pub msets_by_rid: HashMap<usize, TokenMultiset>,
201
202    /// High-value token sets per rule for early candidate rejection.
203    ///
204    /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
205    /// This is a subset of `sets_by_rid` for faster intersection computation
206    /// and early rejection of candidates that won't pass the high-token threshold.
207    ///
208    /// Precomputed during index building to avoid redundant filtering at runtime.
209    pub high_sets_by_rid: HashMap<usize, TokenSet>,
210
211    /// Inverted index of high-value token positions per rule.
212    ///
213    /// Maps rule IDs to a mapping from high-value token IDs to their positions
214    /// within the rule. Only contains positions for tokens with IDs < len_legalese.
215    ///
216    /// This structure speeds up sequence matching by allowing quick lookup of
217    /// where high-value tokens appear in each rule.
218    ///
219    /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
220    /// In Python: `postings = {tid: array('h', [positions, ...])}`
221    pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,
222
223    /// Set of rule IDs for false positive rules.
224    ///
225    /// False positive rules are used for exact matching and post-matching
226    /// filtering to subtract spurious matches.
227    ///
228    /// Corresponds to Python: `self.false_positive_rids = set()` (line 230)
229    pub false_positive_rids: HashSet<usize>,
230
231    /// Set of rule IDs that can be matched approximately.
232    ///
233    /// Only rules marked as approx-matchable participate in sequence matching.
234    /// Other rules can only be matched exactly using the automaton.
235    ///
236    /// Note: This field is kept for Python parity documentation and test usage.
237    /// The inverted index (`rids_by_high_tid`) now handles candidate filtering
238    /// more efficiently, making direct iteration over this set unnecessary.
239    ///
240    /// Corresponds to Python: `self.approx_matchable_rids = set()` (line 234)
241    #[allow(dead_code)]
242    pub approx_matchable_rids: HashSet<usize>,
243
244    /// Mapping from ScanCode license key to License object.
245    ///
246    /// Provides access to license metadata for building SPDX mappings
247    /// and validating license expressions.
248    ///
249    /// Corresponds to Python: `get_licenses_db()` in models.py
250    pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,
251
252    /// Maps AhoCorasick pattern_id to rule ids (rids).
253    ///
254    /// This is needed because the AhoCorasick pattern_id is just the index
255    /// in the patterns iterator used to build the automaton, not the actual
256    /// rule id. In Python, the automaton stores (rid, start, end) tuples as
257    /// values, so the rid is retrieved from the stored value. In Rust, we
258    /// maintain this mapping instead.
259    ///
260    /// Multiple rules can share the same token pattern (e.g., rules that differ
261    /// only in license_expression). Each pattern_id maps to a list of all rule IDs
262    /// that share that pattern.
263    ///
264    /// Corresponds to Python: automaton values contain (rid, istart, iend)
265    pub pattern_id_to_rid: Vec<Vec<usize>>,
266
267    /// Mapping from SPDX license key to rule ID.
268    ///
269    /// Enables direct lookup of rules by their SPDX license key,
270    /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
271    ///
272    /// Keys are stored lowercase for case-insensitive lookup.
273    ///
274    /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
275    pub rid_by_spdx_key: HashMap<String, usize>,
276
277    /// Rule ID for the unknown-spdx license.
278    ///
279    /// Used as a fallback when an SPDX identifier is not recognized.
280    ///
281    /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
282    pub unknown_spdx_rid: Option<usize>,
283
284    /// Inverted index mapping high-value token IDs to rule IDs.
285    ///
286    /// This enables fast candidate selection by only examining rules
287    /// that share at least one high-value (legalese) token with the query.
288    /// Without this index, candidate selection would iterate over all 37,000+
289    /// rules for every file, making license detection extremely slow.
290    ///
291    /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
292    /// Rules not in approx_matchable_rids are excluded from this index.
293    pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
294}
295
296impl LicenseIndex {}
297
298impl LicenseIndex {
299    /// Create a new empty license index.
300    ///
301    /// This constructor initializes all index structures with empty collections.
302    /// The index can be populated with rules using the indexing methods (to be
303    /// implemented in future phases).
304    ///
305    /// # Returns
306    /// A new LicenseIndex instance with empty index structures
307    pub fn new(dictionary: TokenDictionary) -> Self {
308        use crate::license_detection::automaton::AutomatonBuilder;
309
310        let len_legalese = dictionary.legalese_count();
311        Self {
312            dictionary,
313            len_legalese,
314            rid_by_hash: HashMap::new(),
315            rules_by_rid: Vec::new(),
316            tids_by_rid: Vec::new(),
317            rules_automaton: AutomatonBuilder::new().build(),
318            unknown_automaton: AutomatonBuilder::new().build(),
319            sets_by_rid: HashMap::new(),
320            rule_metadata_by_identifier: HashMap::new(),
321            msets_by_rid: HashMap::new(),
322            high_sets_by_rid: HashMap::new(),
323            high_postings_by_rid: HashMap::new(),
324            false_positive_rids: HashSet::new(),
325            approx_matchable_rids: HashSet::new(),
326            licenses_by_key: HashMap::new(),
327            pattern_id_to_rid: Vec::new(),
328            rid_by_spdx_key: HashMap::new(),
329            unknown_spdx_rid: None,
330            rids_by_high_tid: HashMap::new(),
331        }
332    }
333
334    /// Create a new empty license index with the specified legalese count.
335    ///
336    /// Convenience method that creates a new TokenDictionary and LicenseIndex
337    /// in one call.
338    ///
339    /// # Arguments
340    /// * `legalese_count` - Number of reserved legalese token IDs
341    ///
342    /// # Returns
343    /// A new LicenseIndex instance with a new TokenDictionary
344    pub fn with_legalese_count(legalese_count: usize) -> Self {
345        Self::new(TokenDictionary::new(legalese_count))
346    }
347}
348
349impl Default for LicenseIndex {
350    fn default() -> Self {
351        Self::with_legalese_count(0)
352    }
353}
354
355#[cfg(test)]
356mod tests {
357    use super::*;
358
359    fn simple_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
360        License {
361            key: key.to_string(),
362            short_name: Some(name.to_string()),
363            name: name.to_string(),
364            language: Some("en".to_string()),
365            spdx_license_key: Some(spdx.to_string()),
366            other_spdx_license_keys: vec![],
367            category: Some(category.to_string()),
368            owner: None,
369            homepage_url: None,
370            text: text.to_string(),
371            reference_urls: vec![],
372            osi_license_key: Some(spdx.to_string()),
373            text_urls: vec![],
374            osi_url: None,
375            faq_url: None,
376            other_urls: vec![],
377            notes: None,
378            is_deprecated: false,
379            is_exception: false,
380            is_unknown: false,
381            is_generic: false,
382            replaced_by: vec![],
383            minimum_coverage: None,
384            standard_notice: None,
385            ignorable_copyrights: None,
386            ignorable_holders: None,
387            ignorable_authors: None,
388            ignorable_urls: None,
389            ignorable_emails: None,
390        }
391    }
392    use crate::license_detection::models::License;
393
394    #[test]
395    fn test_license_index_new() {
396        let dict = TokenDictionary::new(10);
397        let index = LicenseIndex::new(dict);
398
399        assert_eq!(index.dictionary.legalese_count(), 10);
400        assert!(index.rid_by_hash.is_empty());
401        assert!(index.sets_by_rid.is_empty());
402        assert!(index.msets_by_rid.is_empty());
403        assert!(index.high_postings_by_rid.is_empty());
404        assert!(index.false_positive_rids.is_empty());
405        assert!(index.approx_matchable_rids.is_empty());
406        assert!(index.licenses_by_key.is_empty());
407    }
408
409    #[test]
410    fn test_license_index_with_legalese_count() {
411        let index = LicenseIndex::with_legalese_count(15);
412
413        assert_eq!(index.dictionary.legalese_count(), 15);
414        assert!(index.rid_by_hash.is_empty());
415    }
416
417    #[test]
418    fn test_license_index_default() {
419        let index = LicenseIndex::default();
420
421        assert_eq!(index.dictionary.legalese_count(), 0);
422        assert!(index.rid_by_hash.is_empty());
423    }
424
425    #[test]
426    fn test_automaton_default() {
427        use crate::license_detection::automaton::AutomatonBuilder;
428
429        let automaton = AutomatonBuilder::new().build();
430        let _ = format!("{:?}", automaton);
431    }
432
433    #[test]
434    fn test_license_index_clone() {
435        let index = LicenseIndex::with_legalese_count(5);
436        let cloned = index.clone();
437
438        assert_eq!(cloned.dictionary.legalese_count(), 5);
439        assert!(cloned.rid_by_hash.is_empty());
440    }
441
442    #[test]
443    fn test_license_index_add_license() {
444        let mut index = LicenseIndex::default();
445
446        let license = simple_license(
447            "test-license",
448            "Test License",
449            "TEST",
450            "Permissive",
451            "Test license text",
452        );
453
454        index.licenses_by_key.insert(license.key.clone(), license);
455
456        assert_eq!(index.licenses_by_key.len(), 1);
457        assert!(index.licenses_by_key.contains_key("test-license"));
458    }
459
460    #[test]
461    fn test_license_index_add_licenses() {
462        let mut index = LicenseIndex::default();
463
464        let licenses = vec![
465            simple_license(
466                "license-1",
467                "License 1",
468                "LIC1",
469                "Permissive",
470                "License 1 text",
471            ),
472            simple_license(
473                "license-2",
474                "License 2",
475                "LIC2",
476                "Copyleft",
477                "License 2 text",
478            ),
479        ];
480
481        for license in licenses {
482            index.licenses_by_key.insert(license.key.clone(), license);
483        }
484
485        assert_eq!(index.licenses_by_key.len(), 2);
486        assert!(index.licenses_by_key.contains_key("license-1"));
487        assert!(index.licenses_by_key.contains_key("license-2"));
488    }
489
490    #[test]
491    fn test_license_index_get_license() {
492        let mut index = LicenseIndex::default();
493
494        let license = simple_license(
495            "mit",
496            "MIT License",
497            "MIT",
498            "Permissive",
499            "MIT License text",
500        );
501
502        index.licenses_by_key.insert(license.key.clone(), license);
503
504        let retrieved = index.licenses_by_key.get("mit");
505        assert!(retrieved.is_some());
506        assert_eq!(retrieved.unwrap().name, "MIT License");
507
508        assert!(!index.licenses_by_key.contains_key("unknown"));
509    }
510
511    #[test]
512    fn test_license_index_license_count() {
513        let mut index = LicenseIndex::default();
514
515        assert_eq!(index.licenses_by_key.len(), 0);
516
517        let license = simple_license("test", "Test", "TEST", "Permissive", "Text");
518
519        index.licenses_by_key.insert(license.key.clone(), license);
520
521        assert_eq!(index.licenses_by_key.len(), 1);
522    }
523}