provenant/license_detection/index/mod.rs
1//! License index construction and querying.
2
3pub mod builder;
4pub mod dictionary;
5
6// build_index is used by library tests (see spdx_lid/test.rs, index/builder/tests.rs)
7// even though the binary doesn't use it directly.
8#[allow(unused_imports)]
9pub use builder::{
10 build_index, build_index_from_loaded, build_index_from_loaded_with_automatons,
11 loaded_license_to_license, loaded_rule_to_rule,
12};
13
14use crate::license_detection::automaton::{AsBytes, Automaton};
15use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
16use crate::license_detection::{TokenMultiset, TokenSet};
17use rkyv::Archive;
18use std::collections::{HashMap, HashSet};
19
20#[derive(Debug, Clone, Default, PartialEq, Eq, Archive, rkyv::Serialize, rkyv::Deserialize)]
21pub struct IndexedRuleMetadata {
22 pub license_expression_spdx: Option<String>,
23 pub skip_for_required_phrase_generation: bool,
24 pub replaced_by: Vec<String>,
25}
26
27/// License index containing all data structures for efficient license detection.
28///
29/// The LicenseIndex holds multiple index structures that enable different matching
30/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
31/// candidate selection, and sequence matching.
32///
33/// Based on the Python ScanCode Toolkit implementation at:
34/// reference/scancode-toolkit/src/licensedcode/index.py
35///
36/// # Index Structures
37///
38/// The index maintains several data structures for different matching strategies:
39///
40/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
41/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
42/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
43/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
44/// - **Rule classification**: `false_positive_rids`, `approx_matchable_rids`
45#[derive(Debug, Clone, Archive, rkyv::Serialize, rkyv::Deserialize)]
46pub struct LicenseIndex {
47 /// Token dictionary mapping token strings to integer IDs.
48 ///
49 /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
50 /// IDs len_legalese and above are assigned to other tokens as encountered.
51 pub dictionary: TokenDictionary,
52
53 /// Number of legalese tokens.
54 ///
55 /// Tokens with ID < len_legalese are considered high-value legalese words.
56 /// Tokens with ID >= len_legalese are considered low-value tokens.
57 ///
58 /// Corresponds to Python: `self.len_legalese = 0` (line 185)
59 pub len_legalese: usize,
60
61 /// Mapping from rule hash to rule ID for hash-based exact matching.
62 ///
63 /// This enables fast exact matches using a hash of the rule\'s token IDs.
64 /// Each hash maps to exactly one rule ID.
65 ///
66 /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
67 /// In practice, we use a HashMap<[u8; 20], usize>.
68 ///
69 /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
70 pub rid_by_hash: HashMap<[u8; 20], usize>,
71
72 /// Rules indexed by rule ID.
73 ///
74 /// Maps rule IDs to Rule objects for quick lookup.
75 ///
76 /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
77 pub rules_by_rid: Vec<crate::license_detection::models::Rule>,
78
79 /// Token ID sequences indexed by rule ID.
80 ///
81 /// Maps rule IDs to their token ID sequences.
82 ///
83 /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
84 pub tids_by_rid: Vec<Vec<TokenId>>,
85
86 /// Aho-Corasick automaton built from all rule token sequences.
87 ///
88 /// Supports efficient multi-pattern matching of token ID sequences.
89 /// Used for exact matching of complete rules or rule fragments in query text.
90 ///
91 /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
92 #[rkyv(with = AsBytes)]
93 pub rules_automaton: Automaton,
94
95 /// Aho-Corasick automaton for unknown license detection.
96 ///
97 /// Separate automaton used to detect license-like text that doesn\'t match
98 /// any known rule. Populated with ngrams from all approx-matchable rules.
99 ///
100 /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
101 #[rkyv(with = AsBytes)]
102 pub unknown_automaton: Automaton,
103
104 /// Token ID sets per rule for candidate selection.
105 ///
106 /// Maps rule IDs to sets of unique token IDs present in that rule.
107 /// Used for efficient candidate selection based on token overlap.
108 ///
109 /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
110 pub sets_by_rid: HashMap<usize, TokenSet>,
111
112 pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
113
114 /// Token ID multisets per rule for candidate ranking.
115 ///
116 /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
117 /// Used for ranking candidates by token frequency overlap.
118 ///
119 /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
120 pub msets_by_rid: HashMap<usize, TokenMultiset>,
121
122 /// High-value token sets per rule for early candidate rejection.
123 ///
124 /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
125 /// This is a subset of `sets_by_rid` for faster intersection computation
126 /// and early rejection of candidates that won't pass the high-token threshold.
127 ///
128 /// Precomputed during index building to avoid redundant filtering at runtime.
129 pub high_sets_by_rid: HashMap<usize, TokenSet>,
130
131 /// Inverted index of high-value token positions per rule.
132 ///
133 /// Maps rule IDs to a mapping from high-value token IDs to their positions
134 /// within the rule. Only contains positions for tokens with IDs < len_legalese.
135 ///
136 /// This structure speeds up sequence matching by allowing quick lookup of
137 /// where high-value tokens appear in each rule.
138 ///
139 /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
140 /// In Python: `postings = {tid: array('h', [positions, ...])}`
141 pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,
142
143 /// Set of rule IDs for false positive rules.
144 ///
145 /// False positive rules are used for exact matching and post-matching
146 /// filtering to subtract spurious matches.
147 ///
148 /// Corresponds to Python: `self.false_positive_rids = set()` (line 230)
149 pub false_positive_rids: HashSet<usize>,
150
151 /// Set of rule IDs that can be matched approximately.
152 ///
153 /// Only rules marked as approx-matchable participate in sequence matching.
154 /// Other rules can only be matched exactly using the automaton.
155 ///
156 /// Note: This field is kept for Python parity documentation and test usage.
157 /// The inverted index (`rids_by_high_tid`) now handles candidate filtering
158 /// more efficiently, making direct iteration over this set unnecessary.
159 ///
160 /// Corresponds to Python: `self.approx_matchable_rids = set()` (line 234)
161 #[allow(dead_code)]
162 pub approx_matchable_rids: HashSet<usize>,
163
164 /// Mapping from ScanCode license key to License object.
165 ///
166 /// Provides access to license metadata for building SPDX mappings
167 /// and validating license expressions.
168 ///
169 /// Corresponds to Python: `get_licenses_db()` in models.py
170 pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,
171
172 /// Maps AhoCorasick pattern_id to rule ids (rids).
173 ///
174 /// This is needed because the AhoCorasick pattern_id is just the index
175 /// in the patterns iterator used to build the automaton, not the actual
176 /// rule id. In Python, the automaton stores (rid, start, end) tuples as
177 /// values, so the rid is retrieved from the stored value. In Rust, we
178 /// maintain this mapping instead.
179 ///
180 /// Multiple rules can share the same token pattern (e.g., rules that differ
181 /// only in license_expression). Each pattern_id maps to a list of all rule IDs
182 /// that share that pattern.
183 ///
184 /// Corresponds to Python: automaton values contain (rid, istart, iend)
185 pub pattern_id_to_rid: Vec<Vec<usize>>,
186
187 /// Mapping from SPDX license key to rule ID.
188 ///
189 /// Enables direct lookup of rules by their SPDX license key,
190 /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
191 ///
192 /// Keys are stored lowercase for case-insensitive lookup.
193 ///
194 /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
195 pub rid_by_spdx_key: HashMap<String, usize>,
196
197 /// Rule ID for the unknown-spdx license.
198 ///
199 /// Used as a fallback when an SPDX identifier is not recognized.
200 ///
201 /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
202 pub unknown_spdx_rid: Option<usize>,
203
204 /// Inverted index mapping high-value token IDs to rule IDs.
205 ///
206 /// This enables fast candidate selection by only examining rules
207 /// that share at least one high-value (legalese) token with the query.
208 /// Without this index, candidate selection would iterate over all 37,000+
209 /// rules for every file, making license detection extremely slow.
210 ///
211 /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
212 /// Rules not in approx_matchable_rids are excluded from this index.
213 pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
214
215 /// SPDX license list version used to build this index.
216 pub spdx_license_list_version: Option<String>,
217}
218
219impl LicenseIndex {}
220
221impl LicenseIndex {
222 /// Create a new empty license index.
223 ///
224 /// This constructor initializes all index structures with empty collections.
225 /// The index can be populated with rules using the indexing methods (to be
226 /// implemented in future phases).
227 ///
228 /// # Returns
229 /// A new LicenseIndex instance with empty index structures
230 pub fn new(dictionary: TokenDictionary) -> Self {
231 use crate::license_detection::automaton::AutomatonBuilder;
232
233 let len_legalese = dictionary.legalese_count();
234 Self {
235 dictionary,
236 len_legalese,
237 rid_by_hash: HashMap::new(),
238 rules_by_rid: Vec::new(),
239 tids_by_rid: Vec::new(),
240 rules_automaton: AutomatonBuilder::new().build(),
241 unknown_automaton: AutomatonBuilder::new().build(),
242 sets_by_rid: HashMap::new(),
243 rule_metadata_by_identifier: HashMap::new(),
244 msets_by_rid: HashMap::new(),
245 high_sets_by_rid: HashMap::new(),
246 high_postings_by_rid: HashMap::new(),
247 false_positive_rids: HashSet::new(),
248 approx_matchable_rids: HashSet::new(),
249 licenses_by_key: HashMap::new(),
250 pattern_id_to_rid: Vec::new(),
251 rid_by_spdx_key: HashMap::new(),
252 unknown_spdx_rid: None,
253 rids_by_high_tid: HashMap::new(),
254 spdx_license_list_version: None,
255 }
256 }
257
258 /// Create a new empty license index with the specified legalese count.
259 ///
260 /// Convenience method that creates a new TokenDictionary and LicenseIndex
261 /// in one call.
262 ///
263 /// # Arguments
264 /// * `legalese_count` - Number of reserved legalese token IDs
265 ///
266 /// # Returns
267 /// A new LicenseIndex instance with a new TokenDictionary
268 pub fn with_legalese_count(legalese_count: usize) -> Self {
269 Self::new(TokenDictionary::new(legalese_count))
270 }
271}
272
273impl Default for LicenseIndex {
274 fn default() -> Self {
275 Self::with_legalese_count(0)
276 }
277}
278
279#[cfg(test)]
280mod tests {
281 use super::*;
282
283 fn simple_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
284 License {
285 key: key.to_string(),
286 short_name: Some(name.to_string()),
287 name: name.to_string(),
288 language: Some("en".to_string()),
289 spdx_license_key: Some(spdx.to_string()),
290 other_spdx_license_keys: vec![],
291 category: Some(category.to_string()),
292 owner: None,
293 homepage_url: None,
294 text: text.to_string(),
295 reference_urls: vec![],
296 osi_license_key: Some(spdx.to_string()),
297 text_urls: vec![],
298 osi_url: None,
299 faq_url: None,
300 other_urls: vec![],
301 notes: None,
302 is_deprecated: false,
303 is_exception: false,
304 is_unknown: false,
305 is_generic: false,
306 replaced_by: vec![],
307 minimum_coverage: None,
308 standard_notice: None,
309 ignorable_copyrights: None,
310 ignorable_holders: None,
311 ignorable_authors: None,
312 ignorable_urls: None,
313 ignorable_emails: None,
314 }
315 }
316 use crate::license_detection::models::License;
317
318 #[test]
319 fn test_license_index_new() {
320 let dict = TokenDictionary::new(10);
321 let index = LicenseIndex::new(dict);
322
323 assert_eq!(index.dictionary.legalese_count(), 10);
324 assert!(index.rid_by_hash.is_empty());
325 assert!(index.sets_by_rid.is_empty());
326 assert!(index.msets_by_rid.is_empty());
327 assert!(index.high_postings_by_rid.is_empty());
328 assert!(index.false_positive_rids.is_empty());
329 assert!(index.approx_matchable_rids.is_empty());
330 assert!(index.licenses_by_key.is_empty());
331 }
332
333 #[test]
334 fn test_license_index_with_legalese_count() {
335 let index = LicenseIndex::with_legalese_count(15);
336
337 assert_eq!(index.dictionary.legalese_count(), 15);
338 assert!(index.rid_by_hash.is_empty());
339 }
340
341 #[test]
342 fn test_license_index_default() {
343 let index = LicenseIndex::default();
344
345 assert_eq!(index.dictionary.legalese_count(), 0);
346 assert!(index.rid_by_hash.is_empty());
347 }
348
349 #[test]
350 fn test_automaton_default() {
351 use crate::license_detection::automaton::AutomatonBuilder;
352
353 let automaton = AutomatonBuilder::new().build();
354 let _ = format!("{:?}", automaton);
355 }
356
357 #[test]
358 fn test_license_index_clone() {
359 let index = LicenseIndex::with_legalese_count(5);
360 let cloned = index.clone();
361
362 assert_eq!(cloned.dictionary.legalese_count(), 5);
363 assert!(cloned.rid_by_hash.is_empty());
364 }
365
366 #[test]
367 fn test_license_index_add_license() {
368 let mut index = LicenseIndex::default();
369
370 let license = simple_license(
371 "test-license",
372 "Test License",
373 "TEST",
374 "Permissive",
375 "Test license text",
376 );
377
378 index.licenses_by_key.insert(license.key.clone(), license);
379
380 assert_eq!(index.licenses_by_key.len(), 1);
381 assert!(index.licenses_by_key.contains_key("test-license"));
382 }
383
384 #[test]
385 fn test_license_index_add_licenses() {
386 let mut index = LicenseIndex::default();
387
388 let licenses = vec![
389 simple_license(
390 "license-1",
391 "License 1",
392 "LIC1",
393 "Permissive",
394 "License 1 text",
395 ),
396 simple_license(
397 "license-2",
398 "License 2",
399 "LIC2",
400 "Copyleft",
401 "License 2 text",
402 ),
403 ];
404
405 for license in licenses {
406 index.licenses_by_key.insert(license.key.clone(), license);
407 }
408
409 assert_eq!(index.licenses_by_key.len(), 2);
410 assert!(index.licenses_by_key.contains_key("license-1"));
411 assert!(index.licenses_by_key.contains_key("license-2"));
412 }
413
414 #[test]
415 fn test_license_index_get_license() {
416 let mut index = LicenseIndex::default();
417
418 let license = simple_license(
419 "mit",
420 "MIT License",
421 "MIT",
422 "Permissive",
423 "MIT License text",
424 );
425
426 index.licenses_by_key.insert(license.key.clone(), license);
427
428 let retrieved = index.licenses_by_key.get("mit");
429 assert!(retrieved.is_some());
430 assert_eq!(retrieved.unwrap().name, "MIT License");
431
432 assert!(!index.licenses_by_key.contains_key("unknown"));
433 }
434
435 #[test]
436 fn test_license_index_license_count() {
437 let mut index = LicenseIndex::default();
438
439 assert_eq!(index.licenses_by_key.len(), 0);
440
441 let license = simple_license("test", "Test", "TEST", "Permissive", "Text");
442
443 index.licenses_by_key.insert(license.key.clone(), license);
444
445 assert_eq!(index.licenses_by_key.len(), 1);
446 }
447}