Skip to main content

provenant/license_detection/index/
mod.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! License index construction and querying.
5
6pub mod builder;
7pub mod dictionary;
8
9#[allow(unused_imports)]
10pub use builder::{
11    build_index, build_index_from_loaded, loaded_license_to_license, loaded_rule_to_rule,
12};
13
14use crate::license_detection::automaton::{AsBytes, Automaton};
15use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
16use crate::license_detection::models::RuleId;
17use crate::license_detection::{TokenMultiset, TokenSet};
18use rkyv::Archive;
19use std::collections::{HashMap, HashSet};
20
21#[derive(Debug, Clone, Default, PartialEq, Eq, Archive, rkyv::Serialize, rkyv::Deserialize)]
22pub struct IndexedRuleMetadata {
23    pub license_expression_spdx: Option<String>,
24    pub skip_for_required_phrase_generation: bool,
25    pub replaced_by: Vec<String>,
26}
27
28/// License index containing all data structures for efficient license detection.
29///
30/// The LicenseIndex holds multiple index structures that enable different matching
31/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
32/// candidate selection, and sequence matching.
33///
34/// Based on the Python ScanCode Toolkit implementation at:
35/// reference/scancode-toolkit/src/licensedcode/index.py
36///
37/// # Index Structures
38///
39/// The index maintains several data structures for different matching strategies:
40///
41/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
42/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
43/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
44/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
45#[derive(Debug, Clone, Archive, rkyv::Serialize, rkyv::Deserialize)]
46pub struct LicenseIndex {
47    /// Token dictionary mapping token strings to integer IDs.
48    ///
49    /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
50    /// IDs len_legalese and above are assigned to other tokens as encountered.
51    pub dictionary: TokenDictionary,
52
53    /// Number of legalese tokens.
54    ///
55    /// Tokens with ID < len_legalese are considered high-value legalese words.
56    /// Tokens with ID >= len_legalese are considered low-value tokens.
57    ///
58    /// Corresponds to Python: `self.len_legalese = 0` (line 185)
59    pub len_legalese: usize,
60
61    /// Mapping from rule hash to rule ID for hash-based exact matching.
62    ///
63    /// This enables fast exact matches using a hash of the rule\'s token IDs.
64    /// Each hash maps to exactly one rule ID.
65    ///
66    /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
67    /// In practice, we use a HashMap<[u8; 20], RuleId>.
68    ///
69    /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
70    pub rid_by_hash: HashMap<[u8; 20], RuleId>,
71
72    /// Rules indexed by rule ID.
73    ///
74    /// Maps rule IDs to Rule objects for quick lookup.
75    ///
76    /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
77    pub rules_by_rid: Vec<crate::license_detection::models::Rule>,
78
79    /// Token ID sequences indexed by rule ID.
80    ///
81    /// Maps rule IDs to their token ID sequences.
82    ///
83    /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
84    pub tids_by_rid: Vec<Vec<TokenId>>,
85
86    /// Aho-Corasick automaton built from all rule token sequences.
87    ///
88    /// Supports efficient multi-pattern matching of token ID sequences.
89    /// Used for exact matching of complete rules or rule fragments in query text.
90    ///
91    /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
92    #[rkyv(with = AsBytes)]
93    pub rules_automaton: Automaton,
94
95    /// Aho-Corasick automaton for unknown license detection.
96    ///
97    /// Separate automaton used to detect license-like text that doesn\'t match
98    /// any known rule. Populated with ngrams from all approx-matchable rules.
99    ///
100    /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
101    #[rkyv(with = AsBytes)]
102    pub unknown_automaton: Automaton,
103
104    /// Token ID sets per rule for candidate selection.
105    ///
106    /// Maps rule IDs to sets of unique token IDs present in that rule.
107    /// Used for efficient candidate selection based on token overlap.
108    ///
109    /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
110    pub sets_by_rid: HashMap<RuleId, TokenSet>,
111
112    pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
113
114    /// Token ID multisets per rule for candidate ranking.
115    ///
116    /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
117    /// Used for ranking candidates by token frequency overlap.
118    ///
119    /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
120    pub msets_by_rid: HashMap<RuleId, TokenMultiset>,
121
122    /// High-value token sets per rule for early candidate rejection.
123    ///
124    /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
125    /// This is a subset of `sets_by_rid` for faster intersection computation
126    /// and early rejection of candidates that won't pass the high-token threshold.
127    ///
128    /// Precomputed during index building to avoid redundant filtering at runtime.
129    pub high_sets_by_rid: HashMap<RuleId, TokenSet>,
130
131    /// Inverted index of high-value token positions per rule.
132    ///
133    /// Maps rule IDs to a mapping from high-value token IDs to their positions
134    /// within the rule. Only contains positions for tokens with IDs < len_legalese.
135    ///
136    /// This structure speeds up sequence matching by allowing quick lookup of
137    /// where high-value tokens appear in each rule.
138    ///
139    /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
140    /// In Python: `postings = {tid: array('h', [positions, ...])}`
141    pub high_postings_by_rid: HashMap<RuleId, HashMap<TokenId, Vec<usize>>>,
142
143    /// Mapping from ScanCode license key to License object.
144    ///
145    /// Provides access to license metadata for building SPDX mappings
146    /// and validating license expressions.
147    ///
148    /// Corresponds to Python: `get_licenses_db()` in models.py
149    pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,
150
151    /// Mapping from SPDX license key to rule ID.
152    ///
153    /// Enables direct lookup of rules by their SPDX license key,
154    /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
155    ///
156    /// Keys are stored lowercase for case-insensitive lookup.
157    ///
158    /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
159    pub rid_by_spdx_key: HashMap<String, RuleId>,
160
161    /// Rule ID for the unknown-spdx license.
162    ///
163    /// Used as a fallback when an SPDX identifier is not recognized.
164    ///
165    /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
166    pub unknown_spdx_rid: Option<RuleId>,
167
168    /// Inverted index mapping high-value token IDs to rule IDs.
169    ///
170    /// This enables fast candidate selection by only examining rules
171    /// that share at least one high-value (legalese) token with the query.
172    /// Without this index, candidate selection would iterate over all 37,000+
173    /// rules for every file, making license detection extremely slow.
174    ///
175    /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
176    /// Only approx-matchable rules are included in this index.
177    pub rids_by_high_tid: HashMap<TokenId, HashSet<RuleId>>,
178
179    /// SPDX license list version used to build this index.
180    pub spdx_license_list_version: Option<String>,
181}
182
183impl LicenseIndex {
184    /// Returns the rule for the given ID, or `None` if the ID is invalid
185    /// or out of range.
186    pub fn rule(&self, id: RuleId) -> Option<&crate::license_detection::models::Rule> {
187        if id.is_none() {
188            return None;
189        }
190        self.rules_by_rid.get(id.raw())
191    }
192
193    /// Returns the token ID sequence for the given rule, or `None` if the ID
194    /// is invalid or out of range.
195    pub fn rule_tokens(&self, id: RuleId) -> Option<&[TokenId]> {
196        if id.is_none() {
197            return None;
198        }
199        self.tids_by_rid.get(id.raw()).map(|v| v.as_slice())
200    }
201
202    pub fn is_false_positive(&self, id: RuleId) -> bool {
203        self.rule(id).is_some_and(|r| r.is_false_positive)
204    }
205}
206
207impl LicenseIndex {
208    /// Create a new empty license index.
209    ///
210    /// This constructor initializes all index structures with empty collections.
211    /// The index can be populated with rules using the indexing methods (to be
212    /// implemented in future phases).
213    ///
214    /// # Returns
215    /// A new LicenseIndex instance with empty index structures
216    pub fn new(dictionary: TokenDictionary) -> Self {
217        use crate::license_detection::automaton::AutomatonBuilder;
218
219        let len_legalese = dictionary.legalese_count();
220        Self {
221            dictionary,
222            len_legalese,
223            rid_by_hash: HashMap::new(),
224            rules_by_rid: Vec::new(),
225            tids_by_rid: Vec::new(),
226            rules_automaton: AutomatonBuilder::new().build(),
227            unknown_automaton: AutomatonBuilder::new().build(),
228            sets_by_rid: HashMap::new(),
229            rule_metadata_by_identifier: HashMap::new(),
230            msets_by_rid: HashMap::new(),
231            high_sets_by_rid: HashMap::new(),
232            high_postings_by_rid: HashMap::new(),
233            licenses_by_key: HashMap::new(),
234            rid_by_spdx_key: HashMap::new(),
235            unknown_spdx_rid: None,
236            rids_by_high_tid: HashMap::new(),
237            spdx_license_list_version: None,
238        }
239    }
240
241    /// Create a new empty license index with the specified legalese count.
242    ///
243    /// Convenience method that creates a new TokenDictionary and LicenseIndex
244    /// in one call.
245    ///
246    /// # Arguments
247    /// * `legalese_count` - Number of reserved legalese token IDs
248    ///
249    /// # Returns
250    /// A new LicenseIndex instance with a new TokenDictionary
251    pub fn with_legalese_count(legalese_count: usize) -> Self {
252        Self::new(TokenDictionary::new(legalese_count))
253    }
254}
255
256impl Default for LicenseIndex {
257    fn default() -> Self {
258        Self::with_legalese_count(0)
259    }
260}
261
262#[cfg(test)]
263mod tests {
264    use super::*;
265
266    fn simple_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
267        License {
268            key: key.to_string(),
269            short_name: Some(name.to_string()),
270            name: name.to_string(),
271            language: Some("en".to_string()),
272            spdx_license_key: Some(spdx.to_string()),
273            other_spdx_license_keys: vec![],
274            category: Some(category.to_string()),
275            owner: None,
276            homepage_url: None,
277            text: text.to_string(),
278            reference_urls: vec![],
279            osi_license_key: Some(spdx.to_string()),
280            text_urls: vec![],
281            osi_url: None,
282            faq_url: None,
283            other_urls: vec![],
284            notes: None,
285            is_deprecated: false,
286            is_exception: false,
287            is_unknown: false,
288            is_generic: false,
289            replaced_by: vec![],
290            minimum_coverage: None,
291            standard_notice: None,
292            ignorable_copyrights: None,
293            ignorable_holders: None,
294            ignorable_authors: None,
295            ignorable_urls: None,
296            ignorable_emails: None,
297        }
298    }
299    use crate::license_detection::models::License;
300
301    #[test]
302    fn test_license_index_new() {
303        let dict = TokenDictionary::new(10);
304        let index = LicenseIndex::new(dict);
305
306        assert_eq!(index.dictionary.legalese_count(), 10);
307        assert!(index.rid_by_hash.is_empty());
308        assert!(index.sets_by_rid.is_empty());
309        assert!(index.msets_by_rid.is_empty());
310        assert!(index.high_postings_by_rid.is_empty());
311        assert!(index.licenses_by_key.is_empty());
312    }
313
314    #[test]
315    fn test_license_index_with_legalese_count() {
316        let index = LicenseIndex::with_legalese_count(15);
317
318        assert_eq!(index.dictionary.legalese_count(), 15);
319        assert!(index.rid_by_hash.is_empty());
320    }
321
322    #[test]
323    fn test_license_index_default() {
324        let index = LicenseIndex::default();
325
326        assert_eq!(index.dictionary.legalese_count(), 0);
327        assert!(index.rid_by_hash.is_empty());
328    }
329
330    #[test]
331    fn test_automaton_default() {
332        use crate::license_detection::automaton::AutomatonBuilder;
333
334        let automaton = AutomatonBuilder::new().build();
335        let _ = format!("{:?}", automaton);
336    }
337
338    #[test]
339    fn test_license_index_clone() {
340        let index = LicenseIndex::with_legalese_count(5);
341        let cloned = index.clone();
342
343        assert_eq!(cloned.dictionary.legalese_count(), 5);
344        assert!(cloned.rid_by_hash.is_empty());
345    }
346
347    #[test]
348    fn test_license_index_add_license() {
349        let mut index = LicenseIndex::default();
350
351        let license = simple_license(
352            "test-license",
353            "Test License",
354            "TEST",
355            "Permissive",
356            "Test license text",
357        );
358
359        index.licenses_by_key.insert(license.key.clone(), license);
360
361        assert_eq!(index.licenses_by_key.len(), 1);
362        assert!(index.licenses_by_key.contains_key("test-license"));
363    }
364
365    #[test]
366    fn test_license_index_add_licenses() {
367        let mut index = LicenseIndex::default();
368
369        let licenses = vec![
370            simple_license(
371                "license-1",
372                "License 1",
373                "LIC1",
374                "Permissive",
375                "License 1 text",
376            ),
377            simple_license(
378                "license-2",
379                "License 2",
380                "LIC2",
381                "Copyleft",
382                "License 2 text",
383            ),
384        ];
385
386        for license in licenses {
387            index.licenses_by_key.insert(license.key.clone(), license);
388        }
389
390        assert_eq!(index.licenses_by_key.len(), 2);
391        assert!(index.licenses_by_key.contains_key("license-1"));
392        assert!(index.licenses_by_key.contains_key("license-2"));
393    }
394
395    #[test]
396    fn test_license_index_get_license() {
397        let mut index = LicenseIndex::default();
398
399        let license = simple_license(
400            "mit",
401            "MIT License",
402            "MIT",
403            "Permissive",
404            "MIT License text",
405        );
406
407        index.licenses_by_key.insert(license.key.clone(), license);
408
409        let retrieved = index.licenses_by_key.get("mit");
410        assert!(retrieved.is_some());
411        assert_eq!(retrieved.unwrap().name, "MIT License");
412
413        assert!(!index.licenses_by_key.contains_key("unknown"));
414    }
415
416    #[test]
417    fn test_license_index_license_count() {
418        let mut index = LicenseIndex::default();
419
420        assert_eq!(index.licenses_by_key.len(), 0);
421
422        let license = simple_license("test", "Test", "TEST", "Permissive", "Text");
423
424        index.licenses_by_key.insert(license.key.clone(), license);
425
426        assert_eq!(index.licenses_by_key.len(), 1);
427    }
428}