Skip to main content

sbom_tools/matching/
index.rs

1//! Component index for efficient matching.
2//!
3//! This module provides indexing structures to reduce O(n²) fuzzy comparisons
4//! by pre-normalizing and bucketing components for efficient candidate lookup.
5
6use crate::model::{CanonicalId, Component, NormalizedSbom};
7use rayon::prelude::*;
8use std::collections::{HashMap, HashSet};
9use std::sync::Arc;
10
11/// Pre-computed normalized data for a component.
12#[derive(Debug, Clone)]
13pub struct NormalizedEntry {
14    /// Normalized PURL (if available)
15    pub normalized_purl: Option<String>,
16    /// Normalized component name (lowercase, separators normalized)
17    pub normalized_name: String,
18    /// Length of the normalized name (for length-based filtering)
19    pub name_length: usize,
20    /// Ecosystem extracted from PURL or inferred
21    pub ecosystem: Option<String>,
22    /// First 3 characters of normalized name (for prefix bucketing)
23    pub prefix: String,
24    /// Trigrams (3-character substrings) for fuzzy matching
25    pub trigrams: Vec<String>,
26}
27
28/// Index for efficient component candidate lookup.
29///
30/// Reduces the O(n·m) comparison to O(n·k) where k << m by:
31/// 1. Grouping components by ecosystem
32/// 2. Bucketing by name prefix
33/// 3. Bucketing by trigrams (3-char substrings) for fuzzy matching
34/// 4. Pre-normalizing names for fast comparison
35///
36/// Uses `Arc<CanonicalId>` internally for efficient cloning during index building.
37pub struct ComponentIndex {
38    /// Ecosystem -> list of component IDs in that ecosystem
39    by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>>,
40    /// Normalized name prefix (first 3 chars) -> component IDs
41    by_prefix: HashMap<String, Vec<Arc<CanonicalId>>>,
42    /// Trigram -> list of component IDs containing that trigram
43    by_trigram: HashMap<String, Vec<Arc<CanonicalId>>>,
44    /// Pre-computed normalized data for each component
45    entries: HashMap<Arc<CanonicalId>, NormalizedEntry>,
46    /// All component IDs (for fallback)
47    all_ids: Vec<Arc<CanonicalId>>,
48}
49
50impl ComponentIndex {
51    /// Build an index from an SBOM.
52    ///
53    /// Uses `Arc<CanonicalId>` internally to avoid expensive cloning of IDs
54    /// across multiple index structures.
55    #[must_use]
56    pub fn build(sbom: &NormalizedSbom) -> Self {
57        let mut by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
58        let mut by_prefix: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
59        let mut by_trigram: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
60        let mut entries: HashMap<Arc<CanonicalId>, NormalizedEntry> = HashMap::new();
61        let mut all_ids: Vec<Arc<CanonicalId>> = Vec::new();
62
63        for (id, comp) in &sbom.components {
64            let entry = Self::normalize_component(comp);
65            // Wrap ID in Arc once - all subsequent "clones" are cheap reference count increments
66            let arc_id = Arc::new(id.clone());
67
68            // Index by ecosystem
69            if let Some(ref eco) = entry.ecosystem {
70                by_ecosystem
71                    .entry(eco.clone())
72                    .or_default()
73                    .push(Arc::clone(&arc_id));
74            }
75
76            // Index by name prefix
77            if !entry.prefix.is_empty() {
78                by_prefix
79                    .entry(entry.prefix.clone())
80                    .or_default()
81                    .push(Arc::clone(&arc_id));
82            }
83
84            // Index by trigrams
85            for trigram in &entry.trigrams {
86                by_trigram
87                    .entry(trigram.clone())
88                    .or_default()
89                    .push(Arc::clone(&arc_id));
90            }
91
92            entries.insert(Arc::clone(&arc_id), entry);
93            all_ids.push(arc_id);
94        }
95
96        Self {
97            by_ecosystem,
98            by_prefix,
99            by_trigram,
100            entries,
101            all_ids,
102        }
103    }
104
105    /// Normalize a component for indexing.
106    #[must_use]
107    pub fn normalize_component(comp: &Component) -> NormalizedEntry {
108        // Extract ecosystem from PURL
109        let (ecosystem, normalized_purl) = comp.identifiers.purl.as_ref().map_or_else(
110            || {
111                // Try to infer ecosystem from component type or other fields
112                // Convert Ecosystem enum to String for consistent comparison
113                (
114                    comp.ecosystem
115                        .as_ref()
116                        .map(std::string::ToString::to_string),
117                    None,
118                )
119            },
120            |purl| {
121                let eco = Self::extract_ecosystem(purl);
122                let normalized = Self::normalize_purl(purl);
123                (eco, Some(normalized))
124            },
125        );
126
127        // Normalize name
128        let normalized_name = Self::normalize_name(&comp.name, ecosystem.as_deref());
129        let name_length = normalized_name.len();
130        let prefix = normalized_name.chars().take(3).collect::<String>();
131        let trigrams = Self::compute_trigrams(&normalized_name);
132
133        NormalizedEntry {
134            normalized_purl,
135            normalized_name,
136            name_length,
137            ecosystem,
138            prefix,
139            trigrams,
140        }
141    }
142
143    /// Compute trigrams (3-character substrings) for a normalized name.
144    ///
145    /// Trigrams enable finding matches where only the middle or end differs,
146    /// which prefix-based indexing would miss.
147    fn compute_trigrams(name: &str) -> Vec<String> {
148        if name.len() < 3 {
149            // For very short names, use the name itself as a "trigram"
150            return if name.is_empty() {
151                vec![]
152            } else {
153                vec![name.to_string()]
154            };
155        }
156
157        // Fast path: ASCII-only names (common for package names)
158        // Avoids intermediate Vec<char> allocation
159        if name.is_ascii() {
160            return name
161                .as_bytes()
162                .windows(3)
163                .map(|w| {
164                    // SAFETY: name.is_ascii() was checked above, so all bytes are valid
165                    // single-byte UTF-8 characters. Any 3-byte window is valid UTF-8.
166                    unsafe { std::str::from_utf8_unchecked(w) }.to_string()
167                })
168                .collect();
169        }
170
171        // Slow path: Unicode names - need to collect chars first for windows()
172        let chars: Vec<char> = name.chars().collect();
173        if chars.len() < 3 {
174            return vec![name.to_string()];
175        }
176
177        chars
178            .windows(3)
179            .map(|w| w.iter().collect::<String>())
180            .collect()
181    }
182
183    /// Extract ecosystem from a PURL.
184    fn extract_ecosystem(purl: &str) -> Option<String> {
185        // PURL format: pkg:ecosystem/namespace/name@version
186        if let Some(rest) = purl.strip_prefix("pkg:")
187            && let Some(slash_pos) = rest.find('/')
188        {
189            return Some(rest[..slash_pos].to_lowercase());
190        }
191        None
192    }
193
194    /// Normalize a PURL for comparison.
195    fn normalize_purl(purl: &str) -> String {
196        // Basic normalization: lowercase and strip version qualifiers
197        let purl_lower = purl.to_lowercase();
198        // Remove version part for comparison if present
199        if let Some(at_pos) = purl_lower.rfind('@') {
200            purl_lower[..at_pos].to_string()
201        } else {
202            purl_lower
203        }
204    }
205
206    /// Normalize a component name for comparison.
207    ///
208    /// Applies ecosystem-specific normalization rules:
209    /// - `PyPI`: underscores, hyphens, dots are all equivalent (converted to hyphen)
210    /// - Cargo: hyphens and underscores are equivalent (converted to underscore)
211    /// - npm: lowercase only, preserves scope
212    /// - Default: lowercase with underscore to hyphen conversion
213    ///
214    /// This is also used by LSH for consistent shingle computation.
215    #[must_use]
216    pub fn normalize_name(name: &str, ecosystem: Option<&str>) -> String {
217        let mut normalized = name.to_lowercase();
218
219        // Apply ecosystem-specific normalization
220        match ecosystem {
221            Some("pypi") => {
222                // Python: underscores, hyphens, dots are equivalent
223                normalized = normalized.replace(['_', '.'], "-");
224            }
225            Some("cargo") => {
226                // Rust: hyphens and underscores are equivalent
227                normalized = normalized.replace('-', "_");
228            }
229            Some("npm") => {
230                // npm: already lowercase, preserve scope
231                // Nothing special needed
232            }
233            _ => {
234                // Default: just lowercase, normalize common separators
235                normalized = normalized.replace('_', "-");
236            }
237        }
238
239        // Collapse multiple separators
240        while normalized.contains("--") {
241            normalized = normalized.replace("--", "-");
242        }
243
244        normalized
245    }
246
247    /// Get normalized entry for a component.
248    #[must_use]
249    pub fn get_entry(&self, id: &CanonicalId) -> Option<&NormalizedEntry> {
250        // Arc<T>: Borrow<T> allows HashMap lookup with &CanonicalId
251        self.entries.get(id)
252    }
253
254    /// Get components by ecosystem.
255    ///
256    /// Returns cloned `CanonicalIds` for API stability. The internal storage uses Arc
257    /// to avoid expensive cloning during index building.
258    #[must_use]
259    pub fn get_by_ecosystem(&self, ecosystem: &str) -> Option<Vec<CanonicalId>> {
260        self.by_ecosystem
261            .get(ecosystem)
262            .map(|v| v.iter().map(|arc| (**arc).clone()).collect())
263    }
264
265    /// Find candidate matches for a component.
266    ///
267    /// Returns a list of component IDs that are likely matches, ordered by likelihood.
268    /// Uses ecosystem and prefix-based filtering to reduce candidates.
269    ///
270    /// Returns cloned `CanonicalIds` for API stability. The internal storage uses Arc
271    /// to avoid expensive cloning during index building.
272    #[must_use]
273    pub fn find_candidates(
274        &self,
275        source_id: &CanonicalId,
276        source_entry: &NormalizedEntry,
277        max_candidates: usize,
278        max_length_diff: usize,
279    ) -> Vec<CanonicalId> {
280        let mut candidates: Vec<Arc<CanonicalId>> = Vec::new();
281        let mut seen: HashSet<Arc<CanonicalId>> = HashSet::new();
282
283        // Pre-compute the source trigram set once for ranking within buckets.
284        let source_trigrams: HashSet<&str> =
285            source_entry.trigrams.iter().map(String::as_str).collect();
286
287        // Priority 1: Same ecosystem candidates.
288        //
289        // Rank by trigram overlap with the source *before* the global
290        // truncate(max_candidates), so the true match survives when an
291        // ecosystem bucket is larger than max_candidates (insertion order would
292        // otherwise cut the real match purely by where it landed in the SBOM).
293        if let Some(ref eco) = source_entry.ecosystem
294            && let Some(ids) = self.by_ecosystem.get(eco)
295        {
296            let mut ranked: Vec<(usize, &Arc<CanonicalId>)> = Vec::new();
297            for id in ids {
298                if id.as_ref() != source_id
299                    && !seen.contains(id)
300                    && let Some(entry) = self.entries.get(id.as_ref())
301                {
302                    let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
303                        .unsigned_abs() as usize;
304                    if len_diff <= max_length_diff {
305                        let overlap = entry
306                            .trigrams
307                            .iter()
308                            .filter(|t| source_trigrams.contains(t.as_str()))
309                            .count();
310                        ranked.push((overlap, id));
311                    }
312                }
313            }
314            // Higher overlap first; ties broken by ID for deterministic output.
315            ranked.sort_by(|a, b| b.0.cmp(&a.0).then_with(|| a.1.value().cmp(b.1.value())));
316            for (_, id) in ranked {
317                candidates.push(Arc::clone(id));
318                seen.insert(Arc::clone(id));
319            }
320        }
321
322        // Priority 2: Same prefix candidates (cross-ecosystem fallback)
323        if candidates.len() < max_candidates
324            && !source_entry.prefix.is_empty()
325            && let Some(ids) = self.by_prefix.get(&source_entry.prefix)
326        {
327            for id in ids {
328                if id.as_ref() != source_id
329                    && !seen.contains(id)
330                    && let Some(entry) = self.entries.get(id.as_ref())
331                {
332                    let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
333                        .unsigned_abs() as usize;
334                    if len_diff <= max_length_diff {
335                        candidates.push(Arc::clone(id));
336                        seen.insert(Arc::clone(id));
337                    }
338                }
339                if candidates.len() >= max_candidates {
340                    break;
341                }
342            }
343        }
344
345        // Priority 3: Similar prefixes (1-char difference in prefix)
346        // Iterated in sorted prefix order so truncation is deterministic
347        if candidates.len() < max_candidates && source_entry.prefix.len() >= 2 {
348            let prefix_2 = &source_entry.prefix[..2.min(source_entry.prefix.len())];
349            let mut similar_prefixes: Vec<_> = self
350                .by_prefix
351                .iter()
352                .filter(|(prefix, _)| {
353                    prefix.starts_with(prefix_2) && *prefix != &source_entry.prefix
354                })
355                .collect();
356            similar_prefixes.sort_by(|a, b| a.0.cmp(b.0));
357
358            for (_prefix, ids) in similar_prefixes {
359                for id in ids {
360                    if id.as_ref() != source_id
361                        && !seen.contains(id)
362                        && let Some(entry) = self.entries.get(id.as_ref())
363                    {
364                        let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
365                            .unsigned_abs() as usize;
366                        if len_diff <= max_length_diff {
367                            candidates.push(Arc::clone(id));
368                            seen.insert(Arc::clone(id));
369                        }
370                    }
371                    if candidates.len() >= max_candidates {
372                        break;
373                    }
374                }
375                if candidates.len() >= max_candidates {
376                    break;
377                }
378            }
379        }
380
381        // Priority 4: Trigram-based matching (catches middle/end differences)
382        // Find components that share multiple trigrams with the source
383        if candidates.len() < max_candidates && !source_entry.trigrams.is_empty() {
384            // Count trigram overlap for each candidate
385            let mut trigram_scores: HashMap<Arc<CanonicalId>, usize> = HashMap::new();
386
387            for trigram in &source_entry.trigrams {
388                if let Some(ids) = self.by_trigram.get(trigram) {
389                    for id in ids {
390                        if id.as_ref() != source_id && !seen.contains(id) {
391                            *trigram_scores.entry(Arc::clone(id)).or_default() += 1;
392                        }
393                    }
394                }
395            }
396
397            // Require at least 2 shared trigrams (or 1 for very short names)
398            let min_shared = if source_entry.trigrams.len() <= 2 {
399                1
400            } else {
401                2
402            };
403
404            // Sort by trigram overlap count (descending), breaking ties by ID
405            // so truncation is deterministic
406            let mut scored: Vec<_> = trigram_scores
407                .into_iter()
408                .filter(|(_, count)| *count >= min_shared)
409                .collect();
410            scored.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.value().cmp(b.0.value())));
411
412            for (id, _score) in scored {
413                if candidates.len() >= max_candidates {
414                    break;
415                }
416                if let Some(entry) = self.entries.get(id.as_ref()) {
417                    let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
418                        .unsigned_abs() as usize;
419                    if len_diff <= max_length_diff {
420                        candidates.push(Arc::clone(&id));
421                        seen.insert(id);
422                    }
423                }
424            }
425        }
426
427        // Truncate to max_candidates and convert to owned CanonicalIds
428        candidates.truncate(max_candidates);
429        candidates.into_iter().map(|arc| (*arc).clone()).collect()
430    }
431
432    /// Get all component IDs (for fallback full scan).
433    ///
434    /// Returns cloned `CanonicalIds` for API stability.
435    #[must_use]
436    pub fn all_ids(&self) -> Vec<CanonicalId> {
437        self.all_ids.iter().map(|arc| (**arc).clone()).collect()
438    }
439
440    /// Get the number of indexed components.
441    #[must_use]
442    pub fn len(&self) -> usize {
443        self.entries.len()
444    }
445
446    /// Check if the index is empty.
447    #[must_use]
448    pub fn is_empty(&self) -> bool {
449        self.entries.is_empty()
450    }
451
452    /// Find candidates for multiple source components in parallel.
453    ///
454    /// This is significantly faster than calling `find_candidates` sequentially
455    /// for large SBOMs (1000+ components). Uses rayon for parallel iteration.
456    ///
457    /// Returns a vector of (`source_id`, candidates) pairs in the same order as input.
458    #[must_use]
459    pub fn find_candidates_parallel<'a>(
460        &self,
461        sources: &[(&'a CanonicalId, &NormalizedEntry)],
462        max_candidates: usize,
463        max_length_diff: usize,
464    ) -> Vec<(&'a CanonicalId, Vec<CanonicalId>)> {
465        sources
466            .par_iter()
467            .map(|(source_id, source_entry)| {
468                let candidates =
469                    self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
470                (*source_id, candidates)
471            })
472            .collect()
473    }
474
475    /// Find candidates for all components in another index in parallel.
476    ///
477    /// Useful for diffing two SBOMs: build an index from the new SBOM,
478    /// then find candidates for all components from the old SBOM.
479    #[must_use]
480    pub fn find_all_candidates_from(
481        &self,
482        other: &Self,
483        max_candidates: usize,
484        max_length_diff: usize,
485    ) -> Vec<(CanonicalId, Vec<CanonicalId>)> {
486        let sources: Vec<_> = other.entries.iter().collect();
487
488        sources
489            .par_iter()
490            .map(|(source_id, source_entry)| {
491                let candidates =
492                    self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
493                // Clone the inner CanonicalId from the Arc
494                ((*source_id).as_ref().clone(), candidates)
495            })
496            .collect::<Vec<_>>()
497    }
498
499    /// Get statistics about the index.
500    pub fn stats(&self) -> IndexStats {
501        let ecosystems = self.by_ecosystem.len();
502        let prefixes = self.by_prefix.len();
503        let trigrams = self.by_trigram.len();
504        let avg_per_ecosystem = if ecosystems > 0 {
505            self.by_ecosystem
506                .values()
507                .map(std::vec::Vec::len)
508                .sum::<usize>()
509                / ecosystems
510        } else {
511            0
512        };
513        let avg_per_prefix = if prefixes > 0 {
514            self.by_prefix
515                .values()
516                .map(std::vec::Vec::len)
517                .sum::<usize>()
518                / prefixes
519        } else {
520            0
521        };
522        let avg_per_trigram = if trigrams > 0 {
523            self.by_trigram
524                .values()
525                .map(std::vec::Vec::len)
526                .sum::<usize>()
527                / trigrams
528        } else {
529            0
530        };
531
532        IndexStats {
533            total_components: self.entries.len(),
534            ecosystems,
535            prefixes,
536            trigrams,
537            avg_per_ecosystem,
538            avg_per_prefix,
539            avg_per_trigram,
540        }
541    }
542
543    /// Compute trigram similarity between two entries (Jaccard coefficient).
544    ///
545    /// Returns a value between 0.0 and 1.0 where 1.0 means identical trigram sets.
546    #[must_use]
547    pub fn trigram_similarity(entry_a: &NormalizedEntry, entry_b: &NormalizedEntry) -> f64 {
548        if entry_a.trigrams.is_empty() || entry_b.trigrams.is_empty() {
549            return 0.0;
550        }
551
552        let set_a: HashSet<_> = entry_a.trigrams.iter().collect();
553        let set_b: HashSet<_> = entry_b.trigrams.iter().collect();
554
555        let intersection = set_a.intersection(&set_b).count();
556        let union = set_a.union(&set_b).count();
557
558        if union == 0 {
559            0.0
560        } else {
561            intersection as f64 / union as f64
562        }
563    }
564}
565
566/// Statistics about the component index.
567#[derive(Debug, Clone)]
568pub struct IndexStats {
569    /// Total number of indexed components
570    pub total_components: usize,
571    /// Number of unique ecosystems
572    pub ecosystems: usize,
573    /// Number of unique prefixes
574    pub prefixes: usize,
575    /// Number of unique trigrams
576    pub trigrams: usize,
577    /// Average components per ecosystem
578    pub avg_per_ecosystem: usize,
579    /// Average components per prefix
580    pub avg_per_prefix: usize,
581    /// Average components per trigram
582    pub avg_per_trigram: usize,
583}
584
585/// Batch candidate generator that combines multiple indexing strategies.
586///
587/// For best recall, combines:
588/// 1. `ComponentIndex` (ecosystem, prefix, trigram-based)
589/// 2. LSH index (for large SBOMs, catches approximate matches)
590/// 3. Cross-ecosystem mappings (optional)
591///
592/// The candidates from each source are deduplicated and merged.
593pub struct BatchCandidateGenerator {
594    /// Primary component index
595    component_index: ComponentIndex,
596    /// Optional LSH index for large SBOMs
597    lsh_index: Option<super::lsh::LshIndex>,
598    /// Optional cross-ecosystem database
599    cross_ecosystem_db: Option<super::cross_ecosystem::CrossEcosystemDb>,
600    /// Configuration
601    config: BatchCandidateConfig,
602}
603
604/// Configuration for batch candidate generation.
605#[derive(Debug, Clone)]
606pub struct BatchCandidateConfig {
607    /// Maximum candidates per source component
608    pub max_candidates: usize,
609    /// Maximum name length difference
610    pub max_length_diff: usize,
611    /// Minimum SBOM size to enable LSH (smaller SBOMs don't benefit)
612    pub lsh_threshold: usize,
613    /// Enable cross-ecosystem matching
614    pub enable_cross_ecosystem: bool,
615}
616
617impl Default for BatchCandidateConfig {
618    fn default() -> Self {
619        Self {
620            max_candidates: 100,
621            max_length_diff: 5,
622            lsh_threshold: 500, // Only use LSH for SBOMs with 500+ components
623            enable_cross_ecosystem: true,
624        }
625    }
626}
627
628/// Result of batch candidate generation.
629#[derive(Debug)]
630pub struct BatchCandidateResult {
631    /// Source component ID
632    pub source_id: CanonicalId,
633    /// Candidates from component index
634    pub index_candidates: Vec<CanonicalId>,
635    /// Additional candidates from LSH (not in `index_candidates`)
636    pub lsh_candidates: Vec<CanonicalId>,
637    /// Cross-ecosystem candidates (if different ecosystems)
638    pub cross_ecosystem_candidates: Vec<CanonicalId>,
639    /// Total unique candidates
640    pub total_unique: usize,
641}
642
643impl BatchCandidateGenerator {
644    /// Create a new batch candidate generator from an SBOM.
645    #[must_use]
646    pub fn build(sbom: &NormalizedSbom, config: BatchCandidateConfig) -> Self {
647        let component_index = ComponentIndex::build(sbom);
648
649        // Only build LSH index for large SBOMs
650        let lsh_index = if sbom.component_count() >= config.lsh_threshold {
651            Some(super::lsh::LshIndex::build(
652                sbom,
653                super::lsh::LshConfig::default(),
654            ))
655        } else {
656            None
657        };
658
659        // Optionally load cross-ecosystem database
660        let cross_ecosystem_db = if config.enable_cross_ecosystem {
661            Some(super::cross_ecosystem::CrossEcosystemDb::with_builtin_mappings())
662        } else {
663            None
664        };
665
666        Self {
667            component_index,
668            lsh_index,
669            cross_ecosystem_db,
670            config,
671        }
672    }
673
674    /// Generate candidates for a single component.
675    pub fn find_candidates(
676        &self,
677        source_id: &CanonicalId,
678        source_component: &Component,
679    ) -> BatchCandidateResult {
680        let mut seen: HashSet<CanonicalId> = HashSet::new();
681
682        // Get normalized entry for the source
683        let source_entry = self.component_index.get_entry(source_id).map_or_else(
684            || {
685                // Build entry on the fly if not in our index (source from different SBOM)
686                ComponentIndex::normalize_component(source_component)
687            },
688            NormalizedEntry::clone,
689        );
690
691        // 1. Component index candidates
692        let index_candidates = self.component_index.find_candidates(
693            source_id,
694            &source_entry,
695            self.config.max_candidates,
696            self.config.max_length_diff,
697        );
698        for id in &index_candidates {
699            seen.insert(id.clone());
700        }
701
702        // 2. LSH candidates (additional ones not found by component index)
703        let lsh_candidates: Vec<CanonicalId> =
704            self.lsh_index.as_ref().map_or_else(Vec::new, |lsh| {
705                let candidates: Vec<_> = lsh
706                    .find_candidates(source_component)
707                    .into_iter()
708                    .filter(|id| id != source_id && !seen.contains(id))
709                    .take(self.config.max_candidates / 2) // Limit LSH additions
710                    .collect();
711                for id in &candidates {
712                    seen.insert(id.clone());
713                }
714                candidates
715            });
716
717        // 3. Cross-ecosystem candidates
718        let cross_ecosystem_candidates: Vec<CanonicalId> = if let (Some(db), Some(eco)) =
719            (&self.cross_ecosystem_db, &source_component.ecosystem)
720        {
721            let candidates: Vec<_> = db
722                .find_equivalents(eco, &source_component.name)
723                .into_iter()
724                .flat_map(|m| {
725                    // Look up components with these names in our index
726                    let target_eco_str = m.target_ecosystem.to_string().to_lowercase();
727                    self.component_index
728                        .get_by_ecosystem(&target_eco_str)
729                        .unwrap_or_default()
730                })
731                .filter(|id| id != source_id && !seen.contains(id))
732                .take(self.config.max_candidates / 4) // Limit cross-ecosystem
733                .collect();
734            for id in &candidates {
735                seen.insert(id.clone());
736            }
737            candidates
738        } else {
739            Vec::new()
740        };
741
742        let total_unique = seen.len();
743
744        BatchCandidateResult {
745            source_id: source_id.clone(),
746            index_candidates,
747            lsh_candidates,
748            cross_ecosystem_candidates,
749            total_unique,
750        }
751    }
752
753    /// Generate candidates for multiple components in parallel.
754    #[must_use]
755    pub fn find_candidates_batch(
756        &self,
757        sources: &[(&CanonicalId, &Component)],
758    ) -> Vec<BatchCandidateResult> {
759        sources
760            .par_iter()
761            .map(|(id, comp)| self.find_candidates(id, comp))
762            .collect()
763    }
764
765    /// Get all unique candidates (deduplicated across all strategies).
766    #[must_use]
767    pub fn all_candidates(
768        &self,
769        source_id: &CanonicalId,
770        source_component: &Component,
771    ) -> Vec<CanonicalId> {
772        let result = self.find_candidates(source_id, source_component);
773        let mut all: Vec<_> = result.index_candidates;
774        all.extend(result.lsh_candidates);
775        all.extend(result.cross_ecosystem_candidates);
776        all
777    }
778
779    /// Get the underlying component index.
780    #[must_use]
781    pub const fn component_index(&self) -> &ComponentIndex {
782        &self.component_index
783    }
784
785    /// Check if LSH is enabled.
786    #[must_use]
787    pub const fn has_lsh(&self) -> bool {
788        self.lsh_index.is_some()
789    }
790
791    /// Check if cross-ecosystem matching is enabled.
792    #[must_use]
793    pub const fn has_cross_ecosystem(&self) -> bool {
794        self.cross_ecosystem_db.is_some()
795    }
796
797    /// Get statistics about the generator.
798    pub fn stats(&self) -> BatchCandidateStats {
799        BatchCandidateStats {
800            index_stats: self.component_index.stats(),
801            lsh_enabled: self.lsh_index.is_some(),
802            lsh_stats: self.lsh_index.as_ref().map(super::lsh::LshIndex::stats),
803            cross_ecosystem_enabled: self.cross_ecosystem_db.is_some(),
804        }
805    }
806}
807
808/// Statistics about the batch candidate generator.
809#[derive(Debug)]
810pub struct BatchCandidateStats {
811    /// Component index statistics
812    pub index_stats: IndexStats,
813    /// Whether LSH is enabled
814    pub lsh_enabled: bool,
815    /// LSH statistics (if enabled)
816    pub lsh_stats: Option<super::lsh::LshIndexStats>,
817    /// Whether cross-ecosystem matching is enabled
818    pub cross_ecosystem_enabled: bool,
819}
820
821/// A lazily-built component index that only constructs the index on first use.
822///
823/// This is useful when the index might not be needed (e.g., when doing simple
824/// exact-match only comparisons), or when construction should be deferred.
825pub struct LazyComponentIndex {
826    /// The SBOM to index (stored for deferred building)
827    sbom: Option<std::sync::Arc<NormalizedSbom>>,
828    /// The built index (populated on first access)
829    index: std::sync::OnceLock<ComponentIndex>,
830}
831
832impl LazyComponentIndex {
833    /// Create a new lazy index that will build from the given SBOM on first access.
834    #[must_use]
835    pub const fn new(sbom: std::sync::Arc<NormalizedSbom>) -> Self {
836        Self {
837            sbom: Some(sbom),
838            index: std::sync::OnceLock::new(),
839        }
840    }
841
842    /// Create a lazy index from an already-built `ComponentIndex`.
843    #[must_use]
844    pub fn from_index(index: ComponentIndex) -> Self {
845        let lazy = Self {
846            sbom: None,
847            index: std::sync::OnceLock::new(),
848        };
849        let _ = lazy.index.set(index);
850        lazy
851    }
852
853    /// Get the index, building it if necessary.
854    ///
855    /// This is safe to call from multiple threads - the index will only
856    /// be built once.
857    pub fn get(&self) -> &ComponentIndex {
858        self.index.get_or_init(|| {
859            self.sbom.as_ref().map_or_else(
860                || {
861                    // Empty index as fallback (shouldn't happen in normal use)
862                    ComponentIndex::build(&NormalizedSbom::default())
863                },
864                |sbom| ComponentIndex::build(sbom),
865            )
866        })
867    }
868
869    /// Check if the index has been built yet.
870    pub fn is_built(&self) -> bool {
871        self.index.get().is_some()
872    }
873
874    /// Get the index if already built, without triggering a build.
875    pub fn try_get(&self) -> Option<&ComponentIndex> {
876        self.index.get()
877    }
878}
879
880impl std::ops::Deref for LazyComponentIndex {
881    type Target = ComponentIndex;
882
883    fn deref(&self) -> &Self::Target {
884        self.get()
885    }
886}
887
888#[cfg(test)]
889mod tests {
890    use super::*;
891    use crate::model::{DocumentMetadata, Ecosystem};
892
893    fn make_component(name: &str, purl: Option<&str>) -> Component {
894        let mut comp = Component::new(name.to_string(), format!("test-{}", name));
895        comp.version = Some("1.0.0".to_string());
896        comp.identifiers.purl = purl.map(|s| s.to_string());
897        // Convert extracted ecosystem string to Ecosystem enum
898        comp.ecosystem = purl
899            .and_then(ComponentIndex::extract_ecosystem)
900            .map(|eco_str| Ecosystem::from_purl_type(&eco_str));
901        comp
902    }
903
904    #[test]
905    fn test_extract_ecosystem() {
906        assert_eq!(
907            ComponentIndex::extract_ecosystem("pkg:pypi/requests@2.28.0"),
908            Some("pypi".to_string())
909        );
910        assert_eq!(
911            ComponentIndex::extract_ecosystem("pkg:npm/@angular/core@14.0.0"),
912            Some("npm".to_string())
913        );
914        assert_eq!(
915            ComponentIndex::extract_ecosystem("pkg:cargo/serde@1.0.0"),
916            Some("cargo".to_string())
917        );
918    }
919
920    #[test]
921    fn test_normalize_name_pypi() {
922        assert_eq!(
923            ComponentIndex::normalize_name("Python_Dateutil", Some("pypi")),
924            "python-dateutil"
925        );
926        assert_eq!(
927            ComponentIndex::normalize_name("Some.Package", Some("pypi")),
928            "some-package"
929        );
930    }
931
932    #[test]
933    fn test_normalize_name_cargo() {
934        assert_eq!(
935            ComponentIndex::normalize_name("serde-json", Some("cargo")),
936            "serde_json"
937        );
938    }
939
940    #[test]
941    fn test_build_index() {
942        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
943
944        let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
945        let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
946        let comp3 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
947
948        sbom.add_component(comp1);
949        sbom.add_component(comp2);
950        sbom.add_component(comp3);
951
952        let index = ComponentIndex::build(&sbom);
953
954        assert_eq!(index.len(), 3);
955        assert_eq!(index.by_ecosystem.get("pypi").map(|v| v.len()), Some(2));
956        assert_eq!(index.by_ecosystem.get("cargo").map(|v| v.len()), Some(1));
957    }
958
959    #[test]
960    fn test_find_candidates_same_ecosystem() {
961        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
962
963        let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
964        let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
965        let comp3 = make_component("flask", Some("pkg:pypi/flask@2.0.0"));
966        let comp4 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
967
968        sbom.add_component(comp1.clone());
969        sbom.add_component(comp2);
970        sbom.add_component(comp3);
971        sbom.add_component(comp4);
972
973        let index = ComponentIndex::build(&sbom);
974
975        // Get the ID for requests
976        let requests_id = sbom
977            .components
978            .keys()
979            .find(|id| {
980                sbom.components
981                    .get(*id)
982                    .map(|c| c.name == "requests")
983                    .unwrap_or(false)
984            })
985            .unwrap();
986
987        let entry = index.get_entry(requests_id).unwrap();
988        let candidates = index.find_candidates(requests_id, entry, 10, 5);
989
990        // Should find pypi packages, not cargo packages
991        assert!(candidates.len() >= 2);
992        for cand_id in &candidates {
993            let cand_entry = index.get_entry(cand_id).unwrap();
994            assert_eq!(cand_entry.ecosystem, Some("pypi".to_string()));
995        }
996    }
997
998    #[test]
999    fn test_compute_trigrams() {
1000        // Normal case
1001        let trigrams = ComponentIndex::compute_trigrams("lodash");
1002        assert_eq!(trigrams, vec!["lod", "oda", "das", "ash"]);
1003
1004        // Short name (< 3 chars)
1005        let trigrams = ComponentIndex::compute_trigrams("ab");
1006        assert_eq!(trigrams, vec!["ab"]);
1007
1008        // Empty name
1009        let trigrams = ComponentIndex::compute_trigrams("");
1010        assert!(trigrams.is_empty());
1011
1012        // Exactly 3 chars
1013        let trigrams = ComponentIndex::compute_trigrams("abc");
1014        assert_eq!(trigrams, vec!["abc"]);
1015    }
1016
1017    #[test]
1018    fn test_trigram_similarity() {
1019        let entry_a = NormalizedEntry {
1020            normalized_purl: None,
1021            normalized_name: "lodash".to_string(),
1022            name_length: 6,
1023            ecosystem: None,
1024            prefix: "lod".to_string(),
1025            trigrams: vec![
1026                "lod".to_string(),
1027                "oda".to_string(),
1028                "das".to_string(),
1029                "ash".to_string(),
1030            ],
1031        };
1032
1033        let entry_b = NormalizedEntry {
1034            normalized_purl: None,
1035            normalized_name: "lodash-es".to_string(),
1036            name_length: 9,
1037            ecosystem: None,
1038            prefix: "lod".to_string(),
1039            trigrams: vec![
1040                "lod".to_string(),
1041                "oda".to_string(),
1042                "das".to_string(),
1043                "ash".to_string(),
1044                "sh-".to_string(),
1045                "h-e".to_string(),
1046                "-es".to_string(),
1047            ],
1048        };
1049
1050        let similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_b);
1051        // lodash has 4 trigrams, lodash-es has 7, they share 4
1052        // Jaccard = 4 / 7 ≈ 0.57
1053        assert!(
1054            similarity > 0.5 && similarity < 0.6,
1055            "Expected ~0.57, got {}",
1056            similarity
1057        );
1058
1059        // Identical entries should have similarity 1.0
1060        let same_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_a);
1061        assert!((same_similarity - 1.0).abs() < f64::EPSILON);
1062
1063        // Completely different entries should have low similarity
1064        let entry_c = NormalizedEntry {
1065            normalized_purl: None,
1066            normalized_name: "react".to_string(),
1067            name_length: 5,
1068            ecosystem: None,
1069            prefix: "rea".to_string(),
1070            trigrams: vec!["rea".to_string(), "eac".to_string(), "act".to_string()],
1071        };
1072
1073        let diff_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_c);
1074        assert!(
1075            diff_similarity < 0.1,
1076            "Expected low similarity, got {}",
1077            diff_similarity
1078        );
1079    }
1080
1081    #[test]
1082    fn test_trigram_index_find_similar_suffix() {
1083        // Test that trigram indexing can find packages with different prefixes but similar content
1084        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
1085
1086        // These packages share trigrams in the middle/end
1087        let comp1 = make_component("react-dom", Some("pkg:npm/react-dom@18.0.0"));
1088        let comp2 = make_component("preact-dom", Some("pkg:npm/preact-dom@10.0.0")); // shares "act", "-do", "dom"
1089        let comp3 = make_component("angular", Some("pkg:npm/angular@15.0.0")); // completely different
1090
1091        sbom.add_component(comp1.clone());
1092        sbom.add_component(comp2);
1093        sbom.add_component(comp3);
1094
1095        let index = ComponentIndex::build(&sbom);
1096
1097        // Find ID for react-dom
1098        let react_id = sbom
1099            .components
1100            .keys()
1101            .find(|id| {
1102                sbom.components
1103                    .get(*id)
1104                    .map(|c| c.name == "react-dom")
1105                    .unwrap_or(false)
1106            })
1107            .unwrap();
1108
1109        let entry = index.get_entry(react_id).unwrap();
1110
1111        // Should find preact-dom via trigram matching even though prefix differs
1112        let candidates = index.find_candidates(react_id, entry, 10, 5);
1113
1114        let preact_found = candidates.iter().any(|id| {
1115            index
1116                .get_entry(id)
1117                .map(|e| e.normalized_name.contains("preact"))
1118                .unwrap_or(false)
1119        });
1120
1121        assert!(preact_found, "Should find preact-dom via trigram matching");
1122    }
1123}