Skip to main content

sbom_tools/matching/
index.rs

1//! Component index for efficient matching.
2//!
3//! This module provides indexing structures to reduce O(n²) fuzzy comparisons
4//! by pre-normalizing and bucketing components for efficient candidate lookup.
5
6use crate::model::{CanonicalId, Component, NormalizedSbom};
7use rayon::prelude::*;
8use std::collections::{HashMap, HashSet};
9use std::sync::Arc;
10
11/// Pre-computed normalized data for a component.
12#[derive(Debug, Clone)]
13pub struct NormalizedEntry {
14    /// Normalized PURL (if available)
15    pub normalized_purl: Option<String>,
16    /// Normalized component name (lowercase, separators normalized)
17    pub normalized_name: String,
18    /// Length of the normalized name (for length-based filtering)
19    pub name_length: usize,
20    /// Ecosystem extracted from PURL or inferred
21    pub ecosystem: Option<String>,
22    /// First 3 characters of normalized name (for prefix bucketing)
23    pub prefix: String,
24    /// Trigrams (3-character substrings) for fuzzy matching
25    pub trigrams: Vec<String>,
26}
27
28/// Index for efficient component candidate lookup.
29///
30/// Reduces the O(n·m) comparison to O(n·k) where k << m by:
31/// 1. Grouping components by ecosystem
32/// 2. Bucketing by name prefix
33/// 3. Bucketing by trigrams (3-char substrings) for fuzzy matching
34/// 4. Pre-normalizing names for fast comparison
35///
36/// Uses `Arc<CanonicalId>` internally for efficient cloning during index building.
37pub struct ComponentIndex {
38    /// Ecosystem -> list of component IDs in that ecosystem
39    by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>>,
40    /// Normalized name prefix (first 3 chars) -> component IDs
41    by_prefix: HashMap<String, Vec<Arc<CanonicalId>>>,
42    /// Trigram -> list of component IDs containing that trigram
43    by_trigram: HashMap<String, Vec<Arc<CanonicalId>>>,
44    /// Pre-computed normalized data for each component
45    entries: HashMap<Arc<CanonicalId>, NormalizedEntry>,
46    /// All component IDs (for fallback)
47    all_ids: Vec<Arc<CanonicalId>>,
48}
49
50impl ComponentIndex {
51    /// Build an index from an SBOM.
52    ///
53    /// Uses `Arc<CanonicalId>` internally to avoid expensive cloning of IDs
54    /// across multiple index structures.
55    pub fn build(sbom: &NormalizedSbom) -> Self {
56        let mut by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
57        let mut by_prefix: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
58        let mut by_trigram: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
59        let mut entries: HashMap<Arc<CanonicalId>, NormalizedEntry> = HashMap::new();
60        let mut all_ids: Vec<Arc<CanonicalId>> = Vec::new();
61
62        for (id, comp) in &sbom.components {
63            let entry = Self::normalize_component(comp);
64            // Wrap ID in Arc once - all subsequent "clones" are cheap reference count increments
65            let arc_id = Arc::new(id.clone());
66
67            // Index by ecosystem
68            if let Some(ref eco) = entry.ecosystem {
69                by_ecosystem
70                    .entry(eco.clone())
71                    .or_default()
72                    .push(Arc::clone(&arc_id));
73            }
74
75            // Index by name prefix
76            if !entry.prefix.is_empty() {
77                by_prefix
78                    .entry(entry.prefix.clone())
79                    .or_default()
80                    .push(Arc::clone(&arc_id));
81            }
82
83            // Index by trigrams
84            for trigram in &entry.trigrams {
85                by_trigram
86                    .entry(trigram.clone())
87                    .or_default()
88                    .push(Arc::clone(&arc_id));
89            }
90
91            entries.insert(Arc::clone(&arc_id), entry);
92            all_ids.push(arc_id);
93        }
94
95        Self {
96            by_ecosystem,
97            by_prefix,
98            by_trigram,
99            entries,
100            all_ids,
101        }
102    }
103
104    /// Normalize a component for indexing.
105    pub fn normalize_component(comp: &Component) -> NormalizedEntry {
106        // Extract ecosystem from PURL
107        let (ecosystem, normalized_purl) = if let Some(ref purl) = comp.identifiers.purl {
108            let eco = Self::extract_ecosystem(purl);
109            let normalized = Self::normalize_purl(purl);
110            (eco, Some(normalized))
111        } else {
112            // Try to infer ecosystem from component type or other fields
113            // Convert Ecosystem enum to String for consistent comparison
114            (comp.ecosystem.as_ref().map(|e| e.to_string()), None)
115        };
116
117        // Normalize name
118        let normalized_name = Self::normalize_name(&comp.name, ecosystem.as_deref());
119        let name_length = normalized_name.len();
120        let prefix = normalized_name.chars().take(3).collect::<String>();
121        let trigrams = Self::compute_trigrams(&normalized_name);
122
123        NormalizedEntry {
124            normalized_purl,
125            normalized_name,
126            name_length,
127            ecosystem,
128            prefix,
129            trigrams,
130        }
131    }
132
133    /// Compute trigrams (3-character substrings) for a normalized name.
134    ///
135    /// Trigrams enable finding matches where only the middle or end differs,
136    /// which prefix-based indexing would miss.
137    fn compute_trigrams(name: &str) -> Vec<String> {
138        if name.len() < 3 {
139            // For very short names, use the name itself as a "trigram"
140            return if name.is_empty() {
141                vec![]
142            } else {
143                vec![name.to_string()]
144            };
145        }
146
147        // Fast path: ASCII-only names (common for package names)
148        // Avoids intermediate Vec<char> allocation
149        if name.is_ascii() {
150            return name
151                .as_bytes()
152                .windows(3)
153                .map(|w| {
154                    // SAFETY: name.is_ascii() was checked above, so all bytes are valid
155                    // single-byte UTF-8 characters. Any 3-byte window is valid UTF-8.
156                    unsafe { std::str::from_utf8_unchecked(w) }.to_string()
157                })
158                .collect();
159        }
160
161        // Slow path: Unicode names - need to collect chars first for windows()
162        let chars: Vec<char> = name.chars().collect();
163        if chars.len() < 3 {
164            return vec![name.to_string()];
165        }
166
167        chars
168            .windows(3)
169            .map(|w| w.iter().collect::<String>())
170            .collect()
171    }
172
173    /// Extract ecosystem from a PURL.
174    fn extract_ecosystem(purl: &str) -> Option<String> {
175        // PURL format: pkg:ecosystem/namespace/name@version
176        if let Some(rest) = purl.strip_prefix("pkg:") {
177            if let Some(slash_pos) = rest.find('/') {
178                return Some(rest[..slash_pos].to_lowercase());
179            }
180        }
181        None
182    }
183
184    /// Normalize a PURL for comparison.
185    fn normalize_purl(purl: &str) -> String {
186        // Basic normalization: lowercase and strip version qualifiers
187        let purl_lower = purl.to_lowercase();
188        // Remove version part for comparison if present
189        if let Some(at_pos) = purl_lower.rfind('@') {
190            purl_lower[..at_pos].to_string()
191        } else {
192            purl_lower
193        }
194    }
195
196    /// Normalize a component name for comparison.
197    ///
198    /// Applies ecosystem-specific normalization rules:
199    /// - PyPI: underscores, hyphens, dots are all equivalent (converted to hyphen)
200    /// - Cargo: hyphens and underscores are equivalent (converted to underscore)
201    /// - npm: lowercase only, preserves scope
202    /// - Default: lowercase with underscore to hyphen conversion
203    ///
204    /// This is also used by LSH for consistent shingle computation.
205    pub fn normalize_name(name: &str, ecosystem: Option<&str>) -> String {
206        let mut normalized = name.to_lowercase();
207
208        // Apply ecosystem-specific normalization
209        match ecosystem {
210            Some("pypi") => {
211                // Python: underscores, hyphens, dots are equivalent
212                normalized = normalized.replace(['_', '.'], "-");
213            }
214            Some("cargo") => {
215                // Rust: hyphens and underscores are equivalent
216                normalized = normalized.replace('-', "_");
217            }
218            Some("npm") => {
219                // npm: already lowercase, preserve scope
220                // Nothing special needed
221            }
222            _ => {
223                // Default: just lowercase, normalize common separators
224                normalized = normalized.replace('_', "-");
225            }
226        }
227
228        // Collapse multiple separators
229        while normalized.contains("--") {
230            normalized = normalized.replace("--", "-");
231        }
232
233        normalized
234    }
235
236    /// Get normalized entry for a component.
237    pub fn get_entry(&self, id: &CanonicalId) -> Option<&NormalizedEntry> {
238        // Arc<T>: Borrow<T> allows HashMap lookup with &CanonicalId
239        self.entries.get(id)
240    }
241
242    /// Get components by ecosystem.
243    ///
244    /// Returns cloned CanonicalIds for API stability. The internal storage uses Arc
245    /// to avoid expensive cloning during index building.
246    pub fn get_by_ecosystem(&self, ecosystem: &str) -> Option<Vec<CanonicalId>> {
247        self.by_ecosystem
248            .get(ecosystem)
249            .map(|v| v.iter().map(|arc| (**arc).clone()).collect())
250    }
251
252    /// Find candidate matches for a component.
253    ///
254    /// Returns a list of component IDs that are likely matches, ordered by likelihood.
255    /// Uses ecosystem and prefix-based filtering to reduce candidates.
256    ///
257    /// Returns cloned CanonicalIds for API stability. The internal storage uses Arc
258    /// to avoid expensive cloning during index building.
259    pub fn find_candidates(
260        &self,
261        source_id: &CanonicalId,
262        source_entry: &NormalizedEntry,
263        max_candidates: usize,
264        max_length_diff: usize,
265    ) -> Vec<CanonicalId> {
266        let mut candidates: Vec<Arc<CanonicalId>> = Vec::new();
267        let mut seen: HashSet<Arc<CanonicalId>> = HashSet::new();
268
269        // Priority 1: Same ecosystem candidates
270        if let Some(ref eco) = source_entry.ecosystem {
271            if let Some(ids) = self.by_ecosystem.get(eco) {
272                for id in ids {
273                    if id.as_ref() != source_id && !seen.contains(id) {
274                        // Apply length filter
275                        if let Some(entry) = self.entries.get(id.as_ref()) {
276                            let len_diff = (source_entry.name_length as i32
277                                - entry.name_length as i32)
278                                .unsigned_abs() as usize;
279                            if len_diff <= max_length_diff {
280                                candidates.push(Arc::clone(id));
281                                seen.insert(Arc::clone(id));
282                            }
283                        }
284                    }
285                }
286            }
287        }
288
289        // Priority 2: Same prefix candidates (cross-ecosystem fallback)
290        if candidates.len() < max_candidates && !source_entry.prefix.is_empty() {
291            if let Some(ids) = self.by_prefix.get(&source_entry.prefix) {
292                for id in ids {
293                    if id.as_ref() != source_id && !seen.contains(id) {
294                        if let Some(entry) = self.entries.get(id.as_ref()) {
295                            let len_diff = (source_entry.name_length as i32
296                                - entry.name_length as i32)
297                                .unsigned_abs() as usize;
298                            if len_diff <= max_length_diff {
299                                candidates.push(Arc::clone(id));
300                                seen.insert(Arc::clone(id));
301                            }
302                        }
303                    }
304                    if candidates.len() >= max_candidates {
305                        break;
306                    }
307                }
308            }
309        }
310
311        // Priority 3: Similar prefixes (1-char difference in prefix)
312        if candidates.len() < max_candidates && source_entry.prefix.len() >= 2 {
313            let prefix_2 = &source_entry.prefix[..2.min(source_entry.prefix.len())];
314            for (prefix, ids) in &self.by_prefix {
315                if prefix.starts_with(prefix_2) && prefix != &source_entry.prefix {
316                    for id in ids {
317                        if id.as_ref() != source_id && !seen.contains(id) {
318                            if let Some(entry) = self.entries.get(id.as_ref()) {
319                                let len_diff = (source_entry.name_length as i32
320                                    - entry.name_length as i32)
321                                    .unsigned_abs()
322                                    as usize;
323                                if len_diff <= max_length_diff {
324                                    candidates.push(Arc::clone(id));
325                                    seen.insert(Arc::clone(id));
326                                }
327                            }
328                        }
329                        if candidates.len() >= max_candidates {
330                            break;
331                        }
332                    }
333                }
334                if candidates.len() >= max_candidates {
335                    break;
336                }
337            }
338        }
339
340        // Priority 4: Trigram-based matching (catches middle/end differences)
341        // Find components that share multiple trigrams with the source
342        if candidates.len() < max_candidates && !source_entry.trigrams.is_empty() {
343            // Count trigram overlap for each candidate
344            let mut trigram_scores: HashMap<Arc<CanonicalId>, usize> = HashMap::new();
345
346            for trigram in &source_entry.trigrams {
347                if let Some(ids) = self.by_trigram.get(trigram) {
348                    for id in ids {
349                        if id.as_ref() != source_id && !seen.contains(id) {
350                            *trigram_scores.entry(Arc::clone(id)).or_default() += 1;
351                        }
352                    }
353                }
354            }
355
356            // Require at least 2 shared trigrams (or 1 for very short names)
357            let min_shared = if source_entry.trigrams.len() <= 2 {
358                1
359            } else {
360                2
361            };
362
363            // Sort by trigram overlap count (descending)
364            let mut scored: Vec<_> = trigram_scores
365                .into_iter()
366                .filter(|(_, count)| *count >= min_shared)
367                .collect();
368            scored.sort_by(|a, b| b.1.cmp(&a.1));
369
370            for (id, _score) in scored {
371                if candidates.len() >= max_candidates {
372                    break;
373                }
374                if let Some(entry) = self.entries.get(id.as_ref()) {
375                    let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
376                        .unsigned_abs() as usize;
377                    if len_diff <= max_length_diff {
378                        candidates.push(Arc::clone(&id));
379                        seen.insert(id);
380                    }
381                }
382            }
383        }
384
385        // Truncate to max_candidates and convert to owned CanonicalIds
386        candidates.truncate(max_candidates);
387        candidates.into_iter().map(|arc| (*arc).clone()).collect()
388    }
389
390    /// Get all component IDs (for fallback full scan).
391    ///
392    /// Returns cloned CanonicalIds for API stability.
393    pub fn all_ids(&self) -> Vec<CanonicalId> {
394        self.all_ids.iter().map(|arc| (**arc).clone()).collect()
395    }
396
397    /// Get the number of indexed components.
398    pub fn len(&self) -> usize {
399        self.entries.len()
400    }
401
402    /// Check if the index is empty.
403    pub fn is_empty(&self) -> bool {
404        self.entries.is_empty()
405    }
406
407    /// Find candidates for multiple source components in parallel.
408    ///
409    /// This is significantly faster than calling `find_candidates` sequentially
410    /// for large SBOMs (1000+ components). Uses rayon for parallel iteration.
411    ///
412    /// Returns a vector of (source_id, candidates) pairs in the same order as input.
413    pub fn find_candidates_parallel<'a>(
414        &self,
415        sources: &[(&'a CanonicalId, &NormalizedEntry)],
416        max_candidates: usize,
417        max_length_diff: usize,
418    ) -> Vec<(&'a CanonicalId, Vec<CanonicalId>)> {
419        sources
420            .par_iter()
421            .map(|(source_id, source_entry)| {
422                let candidates =
423                    self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
424                (*source_id, candidates)
425            })
426            .collect()
427    }
428
429    /// Find candidates for all components in another index in parallel.
430    ///
431    /// Useful for diffing two SBOMs: build an index from the new SBOM,
432    /// then find candidates for all components from the old SBOM.
433    pub fn find_all_candidates_from(
434        &self,
435        other: &ComponentIndex,
436        max_candidates: usize,
437        max_length_diff: usize,
438    ) -> Vec<(CanonicalId, Vec<CanonicalId>)> {
439        let sources: Vec<_> = other
440            .entries
441            .iter()
442            .collect();
443
444        sources
445            .par_iter()
446            .map(|(source_id, source_entry)| {
447                let candidates =
448                    self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
449                // Clone the inner CanonicalId from the Arc
450                ((*source_id).as_ref().clone(), candidates)
451            })
452            .collect::<Vec<_>>()
453    }
454
455    /// Get statistics about the index.
456    pub fn stats(&self) -> IndexStats {
457        let ecosystems = self.by_ecosystem.len();
458        let prefixes = self.by_prefix.len();
459        let trigrams = self.by_trigram.len();
460        let avg_per_ecosystem = if ecosystems > 0 {
461            self.by_ecosystem.values().map(|v| v.len()).sum::<usize>() / ecosystems
462        } else {
463            0
464        };
465        let avg_per_prefix = if prefixes > 0 {
466            self.by_prefix.values().map(|v| v.len()).sum::<usize>() / prefixes
467        } else {
468            0
469        };
470        let avg_per_trigram = if trigrams > 0 {
471            self.by_trigram.values().map(|v| v.len()).sum::<usize>() / trigrams
472        } else {
473            0
474        };
475
476        IndexStats {
477            total_components: self.entries.len(),
478            ecosystems,
479            prefixes,
480            trigrams,
481            avg_per_ecosystem,
482            avg_per_prefix,
483            avg_per_trigram,
484        }
485    }
486
487    /// Compute trigram similarity between two entries (Jaccard coefficient).
488    ///
489    /// Returns a value between 0.0 and 1.0 where 1.0 means identical trigram sets.
490    pub fn trigram_similarity(entry_a: &NormalizedEntry, entry_b: &NormalizedEntry) -> f64 {
491        if entry_a.trigrams.is_empty() || entry_b.trigrams.is_empty() {
492            return 0.0;
493        }
494
495        let set_a: HashSet<_> = entry_a.trigrams.iter().collect();
496        let set_b: HashSet<_> = entry_b.trigrams.iter().collect();
497
498        let intersection = set_a.intersection(&set_b).count();
499        let union = set_a.union(&set_b).count();
500
501        if union == 0 {
502            0.0
503        } else {
504            intersection as f64 / union as f64
505        }
506    }
507}
508
509/// Statistics about the component index.
510#[derive(Debug, Clone)]
511pub struct IndexStats {
512    /// Total number of indexed components
513    pub total_components: usize,
514    /// Number of unique ecosystems
515    pub ecosystems: usize,
516    /// Number of unique prefixes
517    pub prefixes: usize,
518    /// Number of unique trigrams
519    pub trigrams: usize,
520    /// Average components per ecosystem
521    pub avg_per_ecosystem: usize,
522    /// Average components per prefix
523    pub avg_per_prefix: usize,
524    /// Average components per trigram
525    pub avg_per_trigram: usize,
526}
527
528/// Batch candidate generator that combines multiple indexing strategies.
529///
530/// For best recall, combines:
531/// 1. ComponentIndex (ecosystem, prefix, trigram-based)
532/// 2. LSH index (for large SBOMs, catches approximate matches)
533/// 3. Cross-ecosystem mappings (optional)
534///
535/// The candidates from each source are deduplicated and merged.
536pub struct BatchCandidateGenerator {
537    /// Primary component index
538    component_index: ComponentIndex,
539    /// Optional LSH index for large SBOMs
540    lsh_index: Option<super::lsh::LshIndex>,
541    /// Optional cross-ecosystem database
542    cross_ecosystem_db: Option<super::cross_ecosystem::CrossEcosystemDb>,
543    /// Configuration
544    config: BatchCandidateConfig,
545}
546
547/// Configuration for batch candidate generation.
548#[derive(Debug, Clone)]
549pub struct BatchCandidateConfig {
550    /// Maximum candidates per source component
551    pub max_candidates: usize,
552    /// Maximum name length difference
553    pub max_length_diff: usize,
554    /// Minimum SBOM size to enable LSH (smaller SBOMs don't benefit)
555    pub lsh_threshold: usize,
556    /// Enable cross-ecosystem matching
557    pub enable_cross_ecosystem: bool,
558}
559
560impl Default for BatchCandidateConfig {
561    fn default() -> Self {
562        Self {
563            max_candidates: 100,
564            max_length_diff: 5,
565            lsh_threshold: 500, // Only use LSH for SBOMs with 500+ components
566            enable_cross_ecosystem: true,
567        }
568    }
569}
570
571/// Result of batch candidate generation.
572#[derive(Debug)]
573pub struct BatchCandidateResult {
574    /// Source component ID
575    pub source_id: CanonicalId,
576    /// Candidates from component index
577    pub index_candidates: Vec<CanonicalId>,
578    /// Additional candidates from LSH (not in index_candidates)
579    pub lsh_candidates: Vec<CanonicalId>,
580    /// Cross-ecosystem candidates (if different ecosystems)
581    pub cross_ecosystem_candidates: Vec<CanonicalId>,
582    /// Total unique candidates
583    pub total_unique: usize,
584}
585
586impl BatchCandidateGenerator {
587    /// Create a new batch candidate generator from an SBOM.
588    pub fn build(sbom: &NormalizedSbom, config: BatchCandidateConfig) -> Self {
589        let component_index = ComponentIndex::build(sbom);
590
591        // Only build LSH index for large SBOMs
592        let lsh_index = if sbom.component_count() >= config.lsh_threshold {
593            Some(super::lsh::LshIndex::build(
594                sbom,
595                super::lsh::LshConfig::default(),
596            ))
597        } else {
598            None
599        };
600
601        // Optionally load cross-ecosystem database
602        let cross_ecosystem_db = if config.enable_cross_ecosystem {
603            Some(super::cross_ecosystem::CrossEcosystemDb::with_builtin_mappings())
604        } else {
605            None
606        };
607
608        Self {
609            component_index,
610            lsh_index,
611            cross_ecosystem_db,
612            config,
613        }
614    }
615
616    /// Generate candidates for a single component.
617    pub fn find_candidates(
618        &self,
619        source_id: &CanonicalId,
620        source_component: &Component,
621    ) -> BatchCandidateResult {
622        let mut seen: HashSet<CanonicalId> = HashSet::new();
623
624        // Get normalized entry for the source
625        let source_entry = if let Some(entry) = self.component_index.get_entry(source_id) {
626            entry.clone()
627        } else {
628            // Build entry on the fly if not in our index (source from different SBOM)
629            ComponentIndex::normalize_component(source_component)
630        };
631
632        // 1. Component index candidates
633        let index_candidates = self.component_index.find_candidates(
634            source_id,
635            &source_entry,
636            self.config.max_candidates,
637            self.config.max_length_diff,
638        );
639        for id in &index_candidates {
640            seen.insert(id.clone());
641        }
642
643        // 2. LSH candidates (additional ones not found by component index)
644        let lsh_candidates: Vec<CanonicalId> = if let Some(ref lsh) = self.lsh_index {
645            let candidates: Vec<_> = lsh
646                .find_candidates(source_component)
647                .into_iter()
648                .filter(|id| id != source_id && !seen.contains(id))
649                .take(self.config.max_candidates / 2) // Limit LSH additions
650                .collect();
651            for id in &candidates {
652                seen.insert(id.clone());
653            }
654            candidates
655        } else {
656            Vec::new()
657        };
658
659        // 3. Cross-ecosystem candidates
660        let cross_ecosystem_candidates: Vec<CanonicalId> =
661            if let (Some(ref db), Some(ref eco)) =
662                (&self.cross_ecosystem_db, &source_component.ecosystem)
663            {
664                let candidates: Vec<_> = db
665                    .find_equivalents(eco, &source_component.name)
666                    .into_iter()
667                    .flat_map(|m| {
668                        // Look up components with these names in our index
669                        let target_eco_str = m.target_ecosystem.to_string().to_lowercase();
670                        self.component_index
671                            .get_by_ecosystem(&target_eco_str)
672                            .unwrap_or_default()
673                    })
674                    .filter(|id| id != source_id && !seen.contains(id))
675                    .take(self.config.max_candidates / 4) // Limit cross-ecosystem
676                    .collect();
677                for id in &candidates {
678                    seen.insert(id.clone());
679                }
680                candidates
681            } else {
682                Vec::new()
683            };
684
685        let total_unique = seen.len();
686
687        BatchCandidateResult {
688            source_id: source_id.clone(),
689            index_candidates,
690            lsh_candidates,
691            cross_ecosystem_candidates,
692            total_unique,
693        }
694    }
695
696    /// Generate candidates for multiple components in parallel.
697    pub fn find_candidates_batch(
698        &self,
699        sources: &[(&CanonicalId, &Component)],
700    ) -> Vec<BatchCandidateResult> {
701        sources
702            .par_iter()
703            .map(|(id, comp)| self.find_candidates(id, comp))
704            .collect()
705    }
706
707    /// Get all unique candidates (deduplicated across all strategies).
708    pub fn all_candidates(
709        &self,
710        source_id: &CanonicalId,
711        source_component: &Component,
712    ) -> Vec<CanonicalId> {
713        let result = self.find_candidates(source_id, source_component);
714        let mut all: Vec<_> = result.index_candidates;
715        all.extend(result.lsh_candidates);
716        all.extend(result.cross_ecosystem_candidates);
717        all
718    }
719
720    /// Get the underlying component index.
721    pub fn component_index(&self) -> &ComponentIndex {
722        &self.component_index
723    }
724
725    /// Check if LSH is enabled.
726    pub fn has_lsh(&self) -> bool {
727        self.lsh_index.is_some()
728    }
729
730    /// Check if cross-ecosystem matching is enabled.
731    pub fn has_cross_ecosystem(&self) -> bool {
732        self.cross_ecosystem_db.is_some()
733    }
734
735    /// Get statistics about the generator.
736    pub fn stats(&self) -> BatchCandidateStats {
737        BatchCandidateStats {
738            index_stats: self.component_index.stats(),
739            lsh_enabled: self.lsh_index.is_some(),
740            lsh_stats: self.lsh_index.as_ref().map(|l| l.stats()),
741            cross_ecosystem_enabled: self.cross_ecosystem_db.is_some(),
742        }
743    }
744}
745
746/// Statistics about the batch candidate generator.
747#[derive(Debug)]
748pub struct BatchCandidateStats {
749    /// Component index statistics
750    pub index_stats: IndexStats,
751    /// Whether LSH is enabled
752    pub lsh_enabled: bool,
753    /// LSH statistics (if enabled)
754    pub lsh_stats: Option<super::lsh::LshIndexStats>,
755    /// Whether cross-ecosystem matching is enabled
756    pub cross_ecosystem_enabled: bool,
757}
758
759/// A lazily-built component index that only constructs the index on first use.
760///
761/// This is useful when the index might not be needed (e.g., when doing simple
762/// exact-match only comparisons), or when construction should be deferred.
763pub struct LazyComponentIndex {
764    /// The SBOM to index (stored for deferred building)
765    sbom: Option<std::sync::Arc<NormalizedSbom>>,
766    /// The built index (populated on first access)
767    index: std::sync::OnceLock<ComponentIndex>,
768}
769
770impl LazyComponentIndex {
771    /// Create a new lazy index that will build from the given SBOM on first access.
772    pub fn new(sbom: std::sync::Arc<NormalizedSbom>) -> Self {
773        Self {
774            sbom: Some(sbom),
775            index: std::sync::OnceLock::new(),
776        }
777    }
778
779    /// Create a lazy index from an already-built ComponentIndex.
780    pub fn from_index(index: ComponentIndex) -> Self {
781        let lazy = Self {
782            sbom: None,
783            index: std::sync::OnceLock::new(),
784        };
785        let _ = lazy.index.set(index);
786        lazy
787    }
788
789    /// Get the index, building it if necessary.
790    ///
791    /// This is safe to call from multiple threads - the index will only
792    /// be built once.
793    pub fn get(&self) -> &ComponentIndex {
794        self.index.get_or_init(|| {
795            if let Some(ref sbom) = self.sbom {
796                ComponentIndex::build(sbom)
797            } else {
798                // Empty index as fallback (shouldn't happen in normal use)
799                ComponentIndex::build(&NormalizedSbom::default())
800            }
801        })
802    }
803
804    /// Check if the index has been built yet.
805    pub fn is_built(&self) -> bool {
806        self.index.get().is_some()
807    }
808
809    /// Get the index if already built, without triggering a build.
810    pub fn try_get(&self) -> Option<&ComponentIndex> {
811        self.index.get()
812    }
813}
814
815impl std::ops::Deref for LazyComponentIndex {
816    type Target = ComponentIndex;
817
818    fn deref(&self) -> &Self::Target {
819        self.get()
820    }
821}
822
823#[cfg(test)]
824mod tests {
825    use super::*;
826    use crate::model::{DocumentMetadata, Ecosystem};
827
828    fn make_component(name: &str, purl: Option<&str>) -> Component {
829        let mut comp = Component::new(name.to_string(), format!("test-{}", name));
830        comp.version = Some("1.0.0".to_string());
831        comp.identifiers.purl = purl.map(|s| s.to_string());
832        // Convert extracted ecosystem string to Ecosystem enum
833        comp.ecosystem = purl
834            .and_then(|p| ComponentIndex::extract_ecosystem(p))
835            .map(|eco_str| Ecosystem::from_purl_type(&eco_str));
836        comp
837    }
838
839    #[test]
840    fn test_extract_ecosystem() {
841        assert_eq!(
842            ComponentIndex::extract_ecosystem("pkg:pypi/requests@2.28.0"),
843            Some("pypi".to_string())
844        );
845        assert_eq!(
846            ComponentIndex::extract_ecosystem("pkg:npm/@angular/core@14.0.0"),
847            Some("npm".to_string())
848        );
849        assert_eq!(
850            ComponentIndex::extract_ecosystem("pkg:cargo/serde@1.0.0"),
851            Some("cargo".to_string())
852        );
853    }
854
855    #[test]
856    fn test_normalize_name_pypi() {
857        assert_eq!(
858            ComponentIndex::normalize_name("Python_Dateutil", Some("pypi")),
859            "python-dateutil"
860        );
861        assert_eq!(
862            ComponentIndex::normalize_name("Some.Package", Some("pypi")),
863            "some-package"
864        );
865    }
866
867    #[test]
868    fn test_normalize_name_cargo() {
869        assert_eq!(
870            ComponentIndex::normalize_name("serde-json", Some("cargo")),
871            "serde_json"
872        );
873    }
874
875    #[test]
876    fn test_build_index() {
877        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
878
879        let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
880        let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
881        let comp3 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
882
883        sbom.add_component(comp1);
884        sbom.add_component(comp2);
885        sbom.add_component(comp3);
886
887        let index = ComponentIndex::build(&sbom);
888
889        assert_eq!(index.len(), 3);
890        assert_eq!(index.by_ecosystem.get("pypi").map(|v| v.len()), Some(2));
891        assert_eq!(index.by_ecosystem.get("cargo").map(|v| v.len()), Some(1));
892    }
893
894    #[test]
895    fn test_find_candidates_same_ecosystem() {
896        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
897
898        let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
899        let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
900        let comp3 = make_component("flask", Some("pkg:pypi/flask@2.0.0"));
901        let comp4 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
902
903        sbom.add_component(comp1.clone());
904        sbom.add_component(comp2);
905        sbom.add_component(comp3);
906        sbom.add_component(comp4);
907
908        let index = ComponentIndex::build(&sbom);
909
910        // Get the ID for requests
911        let requests_id = sbom
912            .components
913            .keys()
914            .find(|id| {
915                sbom.components
916                    .get(*id)
917                    .map(|c| c.name == "requests")
918                    .unwrap_or(false)
919            })
920            .unwrap();
921
922        let entry = index.get_entry(requests_id).unwrap();
923        let candidates = index.find_candidates(requests_id, entry, 10, 5);
924
925        // Should find pypi packages, not cargo packages
926        assert!(candidates.len() >= 2);
927        for cand_id in &candidates {
928            let cand_entry = index.get_entry(cand_id).unwrap();
929            assert_eq!(cand_entry.ecosystem, Some("pypi".to_string()));
930        }
931    }
932
933    #[test]
934    fn test_compute_trigrams() {
935        // Normal case
936        let trigrams = ComponentIndex::compute_trigrams("lodash");
937        assert_eq!(trigrams, vec!["lod", "oda", "das", "ash"]);
938
939        // Short name (< 3 chars)
940        let trigrams = ComponentIndex::compute_trigrams("ab");
941        assert_eq!(trigrams, vec!["ab"]);
942
943        // Empty name
944        let trigrams = ComponentIndex::compute_trigrams("");
945        assert!(trigrams.is_empty());
946
947        // Exactly 3 chars
948        let trigrams = ComponentIndex::compute_trigrams("abc");
949        assert_eq!(trigrams, vec!["abc"]);
950    }
951
952    #[test]
953    fn test_trigram_similarity() {
954        let entry_a = NormalizedEntry {
955            normalized_purl: None,
956            normalized_name: "lodash".to_string(),
957            name_length: 6,
958            ecosystem: None,
959            prefix: "lod".to_string(),
960            trigrams: vec![
961                "lod".to_string(),
962                "oda".to_string(),
963                "das".to_string(),
964                "ash".to_string(),
965            ],
966        };
967
968        let entry_b = NormalizedEntry {
969            normalized_purl: None,
970            normalized_name: "lodash-es".to_string(),
971            name_length: 9,
972            ecosystem: None,
973            prefix: "lod".to_string(),
974            trigrams: vec![
975                "lod".to_string(),
976                "oda".to_string(),
977                "das".to_string(),
978                "ash".to_string(),
979                "sh-".to_string(),
980                "h-e".to_string(),
981                "-es".to_string(),
982            ],
983        };
984
985        let similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_b);
986        // lodash has 4 trigrams, lodash-es has 7, they share 4
987        // Jaccard = 4 / 7 ≈ 0.57
988        assert!(
989            similarity > 0.5 && similarity < 0.6,
990            "Expected ~0.57, got {}",
991            similarity
992        );
993
994        // Identical entries should have similarity 1.0
995        let same_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_a);
996        assert!((same_similarity - 1.0).abs() < f64::EPSILON);
997
998        // Completely different entries should have low similarity
999        let entry_c = NormalizedEntry {
1000            normalized_purl: None,
1001            normalized_name: "react".to_string(),
1002            name_length: 5,
1003            ecosystem: None,
1004            prefix: "rea".to_string(),
1005            trigrams: vec!["rea".to_string(), "eac".to_string(), "act".to_string()],
1006        };
1007
1008        let diff_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_c);
1009        assert!(
1010            diff_similarity < 0.1,
1011            "Expected low similarity, got {}",
1012            diff_similarity
1013        );
1014    }
1015
1016    #[test]
1017    fn test_trigram_index_find_similar_suffix() {
1018        // Test that trigram indexing can find packages with different prefixes but similar content
1019        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
1020
1021        // These packages share trigrams in the middle/end
1022        let comp1 = make_component("react-dom", Some("pkg:npm/react-dom@18.0.0"));
1023        let comp2 = make_component("preact-dom", Some("pkg:npm/preact-dom@10.0.0")); // shares "act", "-do", "dom"
1024        let comp3 = make_component("angular", Some("pkg:npm/angular@15.0.0")); // completely different
1025
1026        sbom.add_component(comp1.clone());
1027        sbom.add_component(comp2);
1028        sbom.add_component(comp3);
1029
1030        let index = ComponentIndex::build(&sbom);
1031
1032        // Find ID for react-dom
1033        let react_id = sbom
1034            .components
1035            .keys()
1036            .find(|id| {
1037                sbom.components
1038                    .get(*id)
1039                    .map(|c| c.name == "react-dom")
1040                    .unwrap_or(false)
1041            })
1042            .unwrap();
1043
1044        let entry = index.get_entry(react_id).unwrap();
1045
1046        // Should find preact-dom via trigram matching even though prefix differs
1047        let candidates = index.find_candidates(react_id, entry, 10, 5);
1048
1049        let preact_found = candidates.iter().any(|id| {
1050            index
1051                .get_entry(id)
1052                .map(|e| e.normalized_name.contains("preact"))
1053                .unwrap_or(false)
1054        });
1055
1056        assert!(preact_found, "Should find preact-dom via trigram matching");
1057    }
1058}