provenant/license_detection/index/mod.rs
1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! License index construction and querying.
5
6pub mod builder;
7pub mod dictionary;
8
9// build_index is used by library tests (see spdx_lid/test.rs, index/builder/tests.rs)
10// even though the binary doesn't use it directly.
11#[allow(unused_imports)]
12pub use builder::{
13 build_index, build_index_from_loaded, build_index_from_loaded_with_automatons,
14 loaded_license_to_license, loaded_rule_to_rule,
15};
16
17use crate::license_detection::automaton::{AsBytes, Automaton};
18use crate::license_detection::index::dictionary::{TokenDictionary, TokenId};
19use crate::license_detection::{TokenMultiset, TokenSet};
20use rkyv::Archive;
21use std::collections::{HashMap, HashSet};
22
23#[derive(Debug, Clone, Default, PartialEq, Eq, Archive, rkyv::Serialize, rkyv::Deserialize)]
24pub struct IndexedRuleMetadata {
25 pub license_expression_spdx: Option<String>,
26 pub skip_for_required_phrase_generation: bool,
27 pub replaced_by: Vec<String>,
28}
29
30/// License index containing all data structures for efficient license detection.
31///
32/// The LicenseIndex holds multiple index structures that enable different matching
33/// strategies: hash-based exact matching, Aho-Corasick automaton matching, set-based
34/// candidate selection, and sequence matching.
35///
36/// Based on the Python ScanCode Toolkit implementation at:
37/// reference/scancode-toolkit/src/licensedcode/index.py
38///
39/// # Index Structures
40///
41/// The index maintains several data structures for different matching strategies:
42///
43/// - **Hash matching**: `rid_by_hash` for exact hash-based matches
44/// - **Automaton matching**: `rules_automaton` and `unknown_automaton` for pattern matching
45/// - **Candidate selection**: `sets_by_rid` and `msets_by_rid` for set-based ranking
46/// - **Sequence matching**: `high_postings_by_rid` for high-value token position tracking
47/// - **Rule classification**: `false_positive_rids`, `approx_matchable_rids`
48#[derive(Debug, Clone, Archive, rkyv::Serialize, rkyv::Deserialize)]
49pub struct LicenseIndex {
50 /// Token dictionary mapping token strings to integer IDs.
51 ///
52 /// IDs 0 to len_legalese-1 are reserved for legalese tokens (high-value words).
53 /// IDs len_legalese and above are assigned to other tokens as encountered.
54 pub dictionary: TokenDictionary,
55
56 /// Number of legalese tokens.
57 ///
58 /// Tokens with ID < len_legalese are considered high-value legalese words.
59 /// Tokens with ID >= len_legalese are considered low-value tokens.
60 ///
61 /// Corresponds to Python: `self.len_legalese = 0` (line 185)
62 pub len_legalese: usize,
63
64 /// Mapping from rule hash to rule ID for hash-based exact matching.
65 ///
66 /// This enables fast exact matches using a hash of the rule\'s token IDs.
67 /// Each hash maps to exactly one rule ID.
68 ///
69 /// Note: The hash is a 20-byte SHA1 digest, stored as a key in HashMap.
70 /// In practice, we use a HashMap<[u8; 20], usize>.
71 ///
72 /// Corresponds to Python: `self.rid_by_hash = {}` (line 216)
73 pub rid_by_hash: HashMap<[u8; 20], usize>,
74
75 /// Rules indexed by rule ID.
76 ///
77 /// Maps rule IDs to Rule objects for quick lookup.
78 ///
79 /// Corresponds to Python: `self.rules_by_rid = []` (line 201)
80 pub rules_by_rid: Vec<crate::license_detection::models::Rule>,
81
82 /// Token ID sequences indexed by rule ID.
83 ///
84 /// Maps rule IDs to their token ID sequences.
85 ///
86 /// Corresponds to Python: `self.tids_by_rid = []` (line 204)
87 pub tids_by_rid: Vec<Vec<TokenId>>,
88
89 /// Aho-Corasick automaton built from all rule token sequences.
90 ///
91 /// Supports efficient multi-pattern matching of token ID sequences.
92 /// Used for exact matching of complete rules or rule fragments in query text.
93 ///
94 /// Corresponds to Python: `self.rules_automaton = match_aho.get_automaton()` (line 219)
95 #[rkyv(with = AsBytes)]
96 pub rules_automaton: Automaton,
97
98 /// Aho-Corasick automaton for unknown license detection.
99 ///
100 /// Separate automaton used to detect license-like text that doesn\'t match
101 /// any known rule. Populated with ngrams from all approx-matchable rules.
102 ///
103 /// Corresponds to Python: `self.unknown_automaton = match_unknown.get_automaton()` (line 222)
104 #[rkyv(with = AsBytes)]
105 pub unknown_automaton: Automaton,
106
107 /// Token ID sets per rule for candidate selection.
108 ///
109 /// Maps rule IDs to sets of unique token IDs present in that rule.
110 /// Used for efficient candidate selection based on token overlap.
111 ///
112 /// Corresponds to Python: `self.sets_by_rid = []` (line 212)
113 pub sets_by_rid: HashMap<usize, TokenSet>,
114
115 pub rule_metadata_by_identifier: HashMap<String, IndexedRuleMetadata>,
116
117 /// Token ID multisets per rule for candidate ranking.
118 ///
119 /// Maps rule IDs to multisets (bags) of token IDs with their frequencies.
120 /// Used for ranking candidates by token frequency overlap.
121 ///
122 /// Corresponds to Python: `self.msets_by_rid = []` (line 213)
123 pub msets_by_rid: HashMap<usize, TokenMultiset>,
124
125 /// High-value token sets per rule for early candidate rejection.
126 ///
127 /// Maps rule IDs to sets containing only high-value (legalese) token IDs.
128 /// This is a subset of `sets_by_rid` for faster intersection computation
129 /// and early rejection of candidates that won't pass the high-token threshold.
130 ///
131 /// Precomputed during index building to avoid redundant filtering at runtime.
132 pub high_sets_by_rid: HashMap<usize, TokenSet>,
133
134 /// Inverted index of high-value token positions per rule.
135 ///
136 /// Maps rule IDs to a mapping from high-value token IDs to their positions
137 /// within the rule. Only contains positions for tokens with IDs < len_legalese.
138 ///
139 /// This structure speeds up sequence matching by allowing quick lookup of
140 /// where high-value tokens appear in each rule.
141 ///
142 /// Corresponds to Python: `self.high_postings_by_rid = []` (line 209)
143 /// In Python: `postings = {tid: array('h', [positions, ...])}`
144 pub high_postings_by_rid: HashMap<usize, HashMap<TokenId, Vec<usize>>>,
145
146 /// Set of rule IDs for false positive rules.
147 ///
148 /// False positive rules are used for exact matching and post-matching
149 /// filtering to subtract spurious matches.
150 ///
151 /// Corresponds to Python: `self.false_positive_rids = set()` (line 230)
152 pub false_positive_rids: HashSet<usize>,
153
154 /// Set of rule IDs that can be matched approximately.
155 ///
156 /// Only rules marked as approx-matchable participate in sequence matching.
157 /// Other rules can only be matched exactly using the automaton.
158 ///
159 /// Note: This field is kept for Python parity documentation and test usage.
160 /// The inverted index (`rids_by_high_tid`) now handles candidate filtering
161 /// more efficiently, making direct iteration over this set unnecessary.
162 ///
163 /// Corresponds to Python: `self.approx_matchable_rids = set()` (line 234)
164 #[allow(dead_code)]
165 pub approx_matchable_rids: HashSet<usize>,
166
167 /// Mapping from ScanCode license key to License object.
168 ///
169 /// Provides access to license metadata for building SPDX mappings
170 /// and validating license expressions.
171 ///
172 /// Corresponds to Python: `get_licenses_db()` in models.py
173 pub licenses_by_key: HashMap<String, crate::license_detection::models::License>,
174
175 /// Maps AhoCorasick pattern_id to rule ids (rids).
176 ///
177 /// This is needed because the AhoCorasick pattern_id is just the index
178 /// in the patterns iterator used to build the automaton, not the actual
179 /// rule id. In Python, the automaton stores (rid, start, end) tuples as
180 /// values, so the rid is retrieved from the stored value. In Rust, we
181 /// maintain this mapping instead.
182 ///
183 /// Multiple rules can share the same token pattern (e.g., rules that differ
184 /// only in license_expression). Each pattern_id maps to a list of all rule IDs
185 /// that share that pattern.
186 ///
187 /// Corresponds to Python: automaton values contain (rid, istart, iend)
188 pub pattern_id_to_rid: Vec<Vec<usize>>,
189
190 /// Mapping from SPDX license key to rule ID.
191 ///
192 /// Enables direct lookup of rules by their SPDX license key,
193 /// including aliases like "GPL-2.0+" -> gpl-2.0-plus.
194 ///
195 /// Keys are stored lowercase for case-insensitive lookup.
196 ///
197 /// Corresponds to Python: `self.licenses_by_spdx_key` in cache.py
198 pub rid_by_spdx_key: HashMap<String, usize>,
199
200 /// Rule ID for the unknown-spdx license.
201 ///
202 /// Used as a fallback when an SPDX identifier is not recognized.
203 ///
204 /// Corresponds to Python: `get_unknown_spdx_symbol()` in cache.py
205 pub unknown_spdx_rid: Option<usize>,
206
207 /// Inverted index mapping high-value token IDs to rule IDs.
208 ///
209 /// This enables fast candidate selection by only examining rules
210 /// that share at least one high-value (legalese) token with the query.
211 /// Without this index, candidate selection would iterate over all 37,000+
212 /// rules for every file, making license detection extremely slow.
213 ///
214 /// Only contains entries for tokens with ID < len_legalese (high-value tokens).
215 /// Rules not in approx_matchable_rids are excluded from this index.
216 pub rids_by_high_tid: HashMap<TokenId, HashSet<usize>>,
217
218 /// SPDX license list version used to build this index.
219 pub spdx_license_list_version: Option<String>,
220}
221
222impl LicenseIndex {}
223
224impl LicenseIndex {
225 /// Create a new empty license index.
226 ///
227 /// This constructor initializes all index structures with empty collections.
228 /// The index can be populated with rules using the indexing methods (to be
229 /// implemented in future phases).
230 ///
231 /// # Returns
232 /// A new LicenseIndex instance with empty index structures
233 pub fn new(dictionary: TokenDictionary) -> Self {
234 use crate::license_detection::automaton::AutomatonBuilder;
235
236 let len_legalese = dictionary.legalese_count();
237 Self {
238 dictionary,
239 len_legalese,
240 rid_by_hash: HashMap::new(),
241 rules_by_rid: Vec::new(),
242 tids_by_rid: Vec::new(),
243 rules_automaton: AutomatonBuilder::new().build(),
244 unknown_automaton: AutomatonBuilder::new().build(),
245 sets_by_rid: HashMap::new(),
246 rule_metadata_by_identifier: HashMap::new(),
247 msets_by_rid: HashMap::new(),
248 high_sets_by_rid: HashMap::new(),
249 high_postings_by_rid: HashMap::new(),
250 false_positive_rids: HashSet::new(),
251 approx_matchable_rids: HashSet::new(),
252 licenses_by_key: HashMap::new(),
253 pattern_id_to_rid: Vec::new(),
254 rid_by_spdx_key: HashMap::new(),
255 unknown_spdx_rid: None,
256 rids_by_high_tid: HashMap::new(),
257 spdx_license_list_version: None,
258 }
259 }
260
261 /// Create a new empty license index with the specified legalese count.
262 ///
263 /// Convenience method that creates a new TokenDictionary and LicenseIndex
264 /// in one call.
265 ///
266 /// # Arguments
267 /// * `legalese_count` - Number of reserved legalese token IDs
268 ///
269 /// # Returns
270 /// A new LicenseIndex instance with a new TokenDictionary
271 pub fn with_legalese_count(legalese_count: usize) -> Self {
272 Self::new(TokenDictionary::new(legalese_count))
273 }
274}
275
276impl Default for LicenseIndex {
277 fn default() -> Self {
278 Self::with_legalese_count(0)
279 }
280}
281
282#[cfg(test)]
283mod tests {
284 use super::*;
285
286 fn simple_license(key: &str, name: &str, spdx: &str, category: &str, text: &str) -> License {
287 License {
288 key: key.to_string(),
289 short_name: Some(name.to_string()),
290 name: name.to_string(),
291 language: Some("en".to_string()),
292 spdx_license_key: Some(spdx.to_string()),
293 other_spdx_license_keys: vec![],
294 category: Some(category.to_string()),
295 owner: None,
296 homepage_url: None,
297 text: text.to_string(),
298 reference_urls: vec![],
299 osi_license_key: Some(spdx.to_string()),
300 text_urls: vec![],
301 osi_url: None,
302 faq_url: None,
303 other_urls: vec![],
304 notes: None,
305 is_deprecated: false,
306 is_exception: false,
307 is_unknown: false,
308 is_generic: false,
309 replaced_by: vec![],
310 minimum_coverage: None,
311 standard_notice: None,
312 ignorable_copyrights: None,
313 ignorable_holders: None,
314 ignorable_authors: None,
315 ignorable_urls: None,
316 ignorable_emails: None,
317 }
318 }
319 use crate::license_detection::models::License;
320
321 #[test]
322 fn test_license_index_new() {
323 let dict = TokenDictionary::new(10);
324 let index = LicenseIndex::new(dict);
325
326 assert_eq!(index.dictionary.legalese_count(), 10);
327 assert!(index.rid_by_hash.is_empty());
328 assert!(index.sets_by_rid.is_empty());
329 assert!(index.msets_by_rid.is_empty());
330 assert!(index.high_postings_by_rid.is_empty());
331 assert!(index.false_positive_rids.is_empty());
332 assert!(index.approx_matchable_rids.is_empty());
333 assert!(index.licenses_by_key.is_empty());
334 }
335
336 #[test]
337 fn test_license_index_with_legalese_count() {
338 let index = LicenseIndex::with_legalese_count(15);
339
340 assert_eq!(index.dictionary.legalese_count(), 15);
341 assert!(index.rid_by_hash.is_empty());
342 }
343
344 #[test]
345 fn test_license_index_default() {
346 let index = LicenseIndex::default();
347
348 assert_eq!(index.dictionary.legalese_count(), 0);
349 assert!(index.rid_by_hash.is_empty());
350 }
351
352 #[test]
353 fn test_automaton_default() {
354 use crate::license_detection::automaton::AutomatonBuilder;
355
356 let automaton = AutomatonBuilder::new().build();
357 let _ = format!("{:?}", automaton);
358 }
359
360 #[test]
361 fn test_license_index_clone() {
362 let index = LicenseIndex::with_legalese_count(5);
363 let cloned = index.clone();
364
365 assert_eq!(cloned.dictionary.legalese_count(), 5);
366 assert!(cloned.rid_by_hash.is_empty());
367 }
368
369 #[test]
370 fn test_license_index_add_license() {
371 let mut index = LicenseIndex::default();
372
373 let license = simple_license(
374 "test-license",
375 "Test License",
376 "TEST",
377 "Permissive",
378 "Test license text",
379 );
380
381 index.licenses_by_key.insert(license.key.clone(), license);
382
383 assert_eq!(index.licenses_by_key.len(), 1);
384 assert!(index.licenses_by_key.contains_key("test-license"));
385 }
386
387 #[test]
388 fn test_license_index_add_licenses() {
389 let mut index = LicenseIndex::default();
390
391 let licenses = vec![
392 simple_license(
393 "license-1",
394 "License 1",
395 "LIC1",
396 "Permissive",
397 "License 1 text",
398 ),
399 simple_license(
400 "license-2",
401 "License 2",
402 "LIC2",
403 "Copyleft",
404 "License 2 text",
405 ),
406 ];
407
408 for license in licenses {
409 index.licenses_by_key.insert(license.key.clone(), license);
410 }
411
412 assert_eq!(index.licenses_by_key.len(), 2);
413 assert!(index.licenses_by_key.contains_key("license-1"));
414 assert!(index.licenses_by_key.contains_key("license-2"));
415 }
416
417 #[test]
418 fn test_license_index_get_license() {
419 let mut index = LicenseIndex::default();
420
421 let license = simple_license(
422 "mit",
423 "MIT License",
424 "MIT",
425 "Permissive",
426 "MIT License text",
427 );
428
429 index.licenses_by_key.insert(license.key.clone(), license);
430
431 let retrieved = index.licenses_by_key.get("mit");
432 assert!(retrieved.is_some());
433 assert_eq!(retrieved.unwrap().name, "MIT License");
434
435 assert!(!index.licenses_by_key.contains_key("unknown"));
436 }
437
438 #[test]
439 fn test_license_index_license_count() {
440 let mut index = LicenseIndex::default();
441
442 assert_eq!(index.licenses_by_key.len(), 0);
443
444 let license = simple_license("test", "Test", "TEST", "Permissive", "Text");
445
446 index.licenses_by_key.insert(license.key.clone(), license);
447
448 assert_eq!(index.licenses_by_key.len(), 1);
449 }
450}