sbom_tools/matching/
index.rs

1//! Component index for efficient matching.
2//!
3//! This module provides indexing structures to reduce O(n²) fuzzy comparisons
4//! by pre-normalizing and bucketing components for efficient candidate lookup.
5
6use crate::model::{CanonicalId, Component, NormalizedSbom};
7use rayon::prelude::*;
8use std::collections::{HashMap, HashSet};
9use std::sync::Arc;
10
11/// Pre-computed normalized data for a component.
12#[derive(Debug, Clone)]
13pub struct NormalizedEntry {
14    /// Normalized PURL (if available)
15    pub normalized_purl: Option<String>,
16    /// Normalized component name (lowercase, separators normalized)
17    pub normalized_name: String,
18    /// Length of the normalized name (for length-based filtering)
19    pub name_length: usize,
20    /// Ecosystem extracted from PURL or inferred
21    pub ecosystem: Option<String>,
22    /// First 3 characters of normalized name (for prefix bucketing)
23    pub prefix: String,
24    /// Trigrams (3-character substrings) for fuzzy matching
25    pub trigrams: Vec<String>,
26}
27
28/// Index for efficient component candidate lookup.
29///
30/// Reduces the O(n·m) comparison to O(n·k) where k << m by:
31/// 1. Grouping components by ecosystem
32/// 2. Bucketing by name prefix
33/// 3. Bucketing by trigrams (3-char substrings) for fuzzy matching
34/// 4. Pre-normalizing names for fast comparison
35///
36/// Uses `Arc<CanonicalId>` internally for efficient cloning during index building.
37pub struct ComponentIndex {
38    /// Ecosystem -> list of component IDs in that ecosystem
39    by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>>,
40    /// Normalized name prefix (first 3 chars) -> component IDs
41    by_prefix: HashMap<String, Vec<Arc<CanonicalId>>>,
42    /// Trigram -> list of component IDs containing that trigram
43    by_trigram: HashMap<String, Vec<Arc<CanonicalId>>>,
44    /// Pre-computed normalized data for each component
45    entries: HashMap<Arc<CanonicalId>, NormalizedEntry>,
46    /// All component IDs (for fallback)
47    all_ids: Vec<Arc<CanonicalId>>,
48}
49
50impl ComponentIndex {
51    /// Build an index from an SBOM.
52    ///
53    /// Uses `Arc<CanonicalId>` internally to avoid expensive cloning of IDs
54    /// across multiple index structures.
55    #[must_use]
56    pub fn build(sbom: &NormalizedSbom) -> Self {
57        let mut by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
58        let mut by_prefix: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
59        let mut by_trigram: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
60        let mut entries: HashMap<Arc<CanonicalId>, NormalizedEntry> = HashMap::new();
61        let mut all_ids: Vec<Arc<CanonicalId>> = Vec::new();
62
63        for (id, comp) in &sbom.components {
64            let entry = Self::normalize_component(comp);
65            // Wrap ID in Arc once - all subsequent "clones" are cheap reference count increments
66            let arc_id = Arc::new(id.clone());
67
68            // Index by ecosystem
69            if let Some(ref eco) = entry.ecosystem {
70                by_ecosystem
71                    .entry(eco.clone())
72                    .or_default()
73                    .push(Arc::clone(&arc_id));
74            }
75
76            // Index by name prefix
77            if !entry.prefix.is_empty() {
78                by_prefix
79                    .entry(entry.prefix.clone())
80                    .or_default()
81                    .push(Arc::clone(&arc_id));
82            }
83
84            // Index by trigrams
85            for trigram in &entry.trigrams {
86                by_trigram
87                    .entry(trigram.clone())
88                    .or_default()
89                    .push(Arc::clone(&arc_id));
90            }
91
92            entries.insert(Arc::clone(&arc_id), entry);
93            all_ids.push(arc_id);
94        }
95
96        Self {
97            by_ecosystem,
98            by_prefix,
99            by_trigram,
100            entries,
101            all_ids,
102        }
103    }
104
105    /// Normalize a component for indexing.
106    #[must_use]
107    pub fn normalize_component(comp: &Component) -> NormalizedEntry {
108        // Extract ecosystem from PURL
109        let (ecosystem, normalized_purl) = comp.identifiers.purl.as_ref().map_or_else(
110            || {
111                // Try to infer ecosystem from component type or other fields
112                // Convert Ecosystem enum to String for consistent comparison
113                (
114                    comp.ecosystem
115                        .as_ref()
116                        .map(std::string::ToString::to_string),
117                    None,
118                )
119            },
120            |purl| {
121                let eco = Self::extract_ecosystem(purl);
122                let normalized = Self::normalize_purl(purl);
123                (eco, Some(normalized))
124            },
125        );
126
127        // Normalize name
128        let normalized_name = Self::normalize_name(&comp.name, ecosystem.as_deref());
129        let name_length = normalized_name.len();
130        let prefix = normalized_name.chars().take(3).collect::<String>();
131        let trigrams = Self::compute_trigrams(&normalized_name);
132
133        NormalizedEntry {
134            normalized_purl,
135            normalized_name,
136            name_length,
137            ecosystem,
138            prefix,
139            trigrams,
140        }
141    }
142
143    /// Compute trigrams (3-character substrings) for a normalized name.
144    ///
145    /// Trigrams enable finding matches where only the middle or end differs,
146    /// which prefix-based indexing would miss.
147    fn compute_trigrams(name: &str) -> Vec<String> {
148        if name.len() < 3 {
149            // For very short names, use the name itself as a "trigram"
150            return if name.is_empty() {
151                vec![]
152            } else {
153                vec![name.to_string()]
154            };
155        }
156
157        // Fast path: ASCII-only names (common for package names)
158        // Avoids intermediate Vec<char> allocation
159        if name.is_ascii() {
160            return name
161                .as_bytes()
162                .windows(3)
163                .map(|w| {
164                    // SAFETY: name.is_ascii() was checked above, so all bytes are valid
165                    // single-byte UTF-8 characters. Any 3-byte window is valid UTF-8.
166                    unsafe { std::str::from_utf8_unchecked(w) }.to_string()
167                })
168                .collect();
169        }
170
171        // Slow path: Unicode names - need to collect chars first for windows()
172        let chars: Vec<char> = name.chars().collect();
173        if chars.len() < 3 {
174            return vec![name.to_string()];
175        }
176
177        chars
178            .windows(3)
179            .map(|w| w.iter().collect::<String>())
180            .collect()
181    }
182
183    /// Extract ecosystem from a PURL.
184    fn extract_ecosystem(purl: &str) -> Option<String> {
185        // PURL format: pkg:ecosystem/namespace/name@version
186        if let Some(rest) = purl.strip_prefix("pkg:")
187            && let Some(slash_pos) = rest.find('/')
188        {
189            return Some(rest[..slash_pos].to_lowercase());
190        }
191        None
192    }
193
194    /// Normalize a PURL for comparison.
195    fn normalize_purl(purl: &str) -> String {
196        // Basic normalization: lowercase and strip version qualifiers
197        let purl_lower = purl.to_lowercase();
198        // Remove version part for comparison if present
199        if let Some(at_pos) = purl_lower.rfind('@') {
200            purl_lower[..at_pos].to_string()
201        } else {
202            purl_lower
203        }
204    }
205
206    /// Normalize a component name for comparison.
207    ///
208    /// Applies ecosystem-specific normalization rules:
209    /// - `PyPI`: underscores, hyphens, dots are all equivalent (converted to hyphen)
210    /// - Cargo: hyphens and underscores are equivalent (converted to underscore)
211    /// - npm: lowercase only, preserves scope
212    /// - Default: lowercase with underscore to hyphen conversion
213    ///
214    /// This is also used by LSH for consistent shingle computation.
215    #[must_use]
216    pub fn normalize_name(name: &str, ecosystem: Option<&str>) -> String {
217        let mut normalized = name.to_lowercase();
218
219        // Apply ecosystem-specific normalization
220        match ecosystem {
221            Some("pypi") => {
222                // Python: underscores, hyphens, dots are equivalent
223                normalized = normalized.replace(['_', '.'], "-");
224            }
225            Some("cargo") => {
226                // Rust: hyphens and underscores are equivalent
227                normalized = normalized.replace('-', "_");
228            }
229            Some("npm") => {
230                // npm: already lowercase, preserve scope
231                // Nothing special needed
232            }
233            _ => {
234                // Default: just lowercase, normalize common separators
235                normalized = normalized.replace('_', "-");
236            }
237        }
238
239        // Collapse multiple separators
240        while normalized.contains("--") {
241            normalized = normalized.replace("--", "-");
242        }
243
244        normalized
245    }
246
247    /// Get normalized entry for a component.
248    #[must_use]
249    pub fn get_entry(&self, id: &CanonicalId) -> Option<&NormalizedEntry> {
250        // Arc<T>: Borrow<T> allows HashMap lookup with &CanonicalId
251        self.entries.get(id)
252    }
253
254    /// Get components by ecosystem.
255    ///
256    /// Returns cloned `CanonicalIds` for API stability. The internal storage uses Arc
257    /// to avoid expensive cloning during index building.
258    #[must_use]
259    pub fn get_by_ecosystem(&self, ecosystem: &str) -> Option<Vec<CanonicalId>> {
260        self.by_ecosystem
261            .get(ecosystem)
262            .map(|v| v.iter().map(|arc| (**arc).clone()).collect())
263    }
264
265    /// Find candidate matches for a component.
266    ///
267    /// Returns a list of component IDs that are likely matches, ordered by likelihood.
268    /// Uses ecosystem and prefix-based filtering to reduce candidates.
269    ///
270    /// Returns cloned `CanonicalIds` for API stability. The internal storage uses Arc
271    /// to avoid expensive cloning during index building.
272    #[must_use]
273    pub fn find_candidates(
274        &self,
275        source_id: &CanonicalId,
276        source_entry: &NormalizedEntry,
277        max_candidates: usize,
278        max_length_diff: usize,
279    ) -> Vec<CanonicalId> {
280        let mut candidates: Vec<Arc<CanonicalId>> = Vec::new();
281        let mut seen: HashSet<Arc<CanonicalId>> = HashSet::new();
282
283        // Priority 1: Same ecosystem candidates
284        if let Some(ref eco) = source_entry.ecosystem
285            && let Some(ids) = self.by_ecosystem.get(eco)
286        {
287            for id in ids {
288                if id.as_ref() != source_id && !seen.contains(id) {
289                    // Apply length filter
290                    if let Some(entry) = self.entries.get(id.as_ref()) {
291                        let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
292                            .unsigned_abs() as usize;
293                        if len_diff <= max_length_diff {
294                            candidates.push(Arc::clone(id));
295                            seen.insert(Arc::clone(id));
296                        }
297                    }
298                }
299            }
300        }
301
302        // Priority 2: Same prefix candidates (cross-ecosystem fallback)
303        if candidates.len() < max_candidates
304            && !source_entry.prefix.is_empty()
305            && let Some(ids) = self.by_prefix.get(&source_entry.prefix)
306        {
307            for id in ids {
308                if id.as_ref() != source_id
309                    && !seen.contains(id)
310                    && let Some(entry) = self.entries.get(id.as_ref())
311                {
312                    let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
313                        .unsigned_abs() as usize;
314                    if len_diff <= max_length_diff {
315                        candidates.push(Arc::clone(id));
316                        seen.insert(Arc::clone(id));
317                    }
318                }
319                if candidates.len() >= max_candidates {
320                    break;
321                }
322            }
323        }
324
325        // Priority 3: Similar prefixes (1-char difference in prefix)
326        if candidates.len() < max_candidates && source_entry.prefix.len() >= 2 {
327            let prefix_2 = &source_entry.prefix[..2.min(source_entry.prefix.len())];
328            for (prefix, ids) in &self.by_prefix {
329                if prefix.starts_with(prefix_2) && prefix != &source_entry.prefix {
330                    for id in ids {
331                        if id.as_ref() != source_id
332                            && !seen.contains(id)
333                            && let Some(entry) = self.entries.get(id.as_ref())
334                        {
335                            let len_diff = (source_entry.name_length as i32
336                                - entry.name_length as i32)
337                                .unsigned_abs() as usize;
338                            if len_diff <= max_length_diff {
339                                candidates.push(Arc::clone(id));
340                                seen.insert(Arc::clone(id));
341                            }
342                        }
343                        if candidates.len() >= max_candidates {
344                            break;
345                        }
346                    }
347                }
348                if candidates.len() >= max_candidates {
349                    break;
350                }
351            }
352        }
353
354        // Priority 4: Trigram-based matching (catches middle/end differences)
355        // Find components that share multiple trigrams with the source
356        if candidates.len() < max_candidates && !source_entry.trigrams.is_empty() {
357            // Count trigram overlap for each candidate
358            let mut trigram_scores: HashMap<Arc<CanonicalId>, usize> = HashMap::new();
359
360            for trigram in &source_entry.trigrams {
361                if let Some(ids) = self.by_trigram.get(trigram) {
362                    for id in ids {
363                        if id.as_ref() != source_id && !seen.contains(id) {
364                            *trigram_scores.entry(Arc::clone(id)).or_default() += 1;
365                        }
366                    }
367                }
368            }
369
370            // Require at least 2 shared trigrams (or 1 for very short names)
371            let min_shared = if source_entry.trigrams.len() <= 2 {
372                1
373            } else {
374                2
375            };
376
377            // Sort by trigram overlap count (descending)
378            let mut scored: Vec<_> = trigram_scores
379                .into_iter()
380                .filter(|(_, count)| *count >= min_shared)
381                .collect();
382            scored.sort_by(|a, b| b.1.cmp(&a.1));
383
384            for (id, _score) in scored {
385                if candidates.len() >= max_candidates {
386                    break;
387                }
388                if let Some(entry) = self.entries.get(id.as_ref()) {
389                    let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
390                        .unsigned_abs() as usize;
391                    if len_diff <= max_length_diff {
392                        candidates.push(Arc::clone(&id));
393                        seen.insert(id);
394                    }
395                }
396            }
397        }
398
399        // Truncate to max_candidates and convert to owned CanonicalIds
400        candidates.truncate(max_candidates);
401        candidates.into_iter().map(|arc| (*arc).clone()).collect()
402    }
403
404    /// Get all component IDs (for fallback full scan).
405    ///
406    /// Returns cloned `CanonicalIds` for API stability.
407    #[must_use]
408    pub fn all_ids(&self) -> Vec<CanonicalId> {
409        self.all_ids.iter().map(|arc| (**arc).clone()).collect()
410    }
411
412    /// Get the number of indexed components.
413    #[must_use]
414    pub fn len(&self) -> usize {
415        self.entries.len()
416    }
417
418    /// Check if the index is empty.
419    #[must_use]
420    pub fn is_empty(&self) -> bool {
421        self.entries.is_empty()
422    }
423
424    /// Find candidates for multiple source components in parallel.
425    ///
426    /// This is significantly faster than calling `find_candidates` sequentially
427    /// for large SBOMs (1000+ components). Uses rayon for parallel iteration.
428    ///
429    /// Returns a vector of (`source_id`, candidates) pairs in the same order as input.
430    #[must_use]
431    pub fn find_candidates_parallel<'a>(
432        &self,
433        sources: &[(&'a CanonicalId, &NormalizedEntry)],
434        max_candidates: usize,
435        max_length_diff: usize,
436    ) -> Vec<(&'a CanonicalId, Vec<CanonicalId>)> {
437        sources
438            .par_iter()
439            .map(|(source_id, source_entry)| {
440                let candidates =
441                    self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
442                (*source_id, candidates)
443            })
444            .collect()
445    }
446
447    /// Find candidates for all components in another index in parallel.
448    ///
449    /// Useful for diffing two SBOMs: build an index from the new SBOM,
450    /// then find candidates for all components from the old SBOM.
451    #[must_use]
452    pub fn find_all_candidates_from(
453        &self,
454        other: &Self,
455        max_candidates: usize,
456        max_length_diff: usize,
457    ) -> Vec<(CanonicalId, Vec<CanonicalId>)> {
458        let sources: Vec<_> = other.entries.iter().collect();
459
460        sources
461            .par_iter()
462            .map(|(source_id, source_entry)| {
463                let candidates =
464                    self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
465                // Clone the inner CanonicalId from the Arc
466                ((*source_id).as_ref().clone(), candidates)
467            })
468            .collect::<Vec<_>>()
469    }
470
471    /// Get statistics about the index.
472    pub fn stats(&self) -> IndexStats {
473        let ecosystems = self.by_ecosystem.len();
474        let prefixes = self.by_prefix.len();
475        let trigrams = self.by_trigram.len();
476        let avg_per_ecosystem = if ecosystems > 0 {
477            self.by_ecosystem
478                .values()
479                .map(std::vec::Vec::len)
480                .sum::<usize>()
481                / ecosystems
482        } else {
483            0
484        };
485        let avg_per_prefix = if prefixes > 0 {
486            self.by_prefix
487                .values()
488                .map(std::vec::Vec::len)
489                .sum::<usize>()
490                / prefixes
491        } else {
492            0
493        };
494        let avg_per_trigram = if trigrams > 0 {
495            self.by_trigram
496                .values()
497                .map(std::vec::Vec::len)
498                .sum::<usize>()
499                / trigrams
500        } else {
501            0
502        };
503
504        IndexStats {
505            total_components: self.entries.len(),
506            ecosystems,
507            prefixes,
508            trigrams,
509            avg_per_ecosystem,
510            avg_per_prefix,
511            avg_per_trigram,
512        }
513    }
514
515    /// Compute trigram similarity between two entries (Jaccard coefficient).
516    ///
517    /// Returns a value between 0.0 and 1.0 where 1.0 means identical trigram sets.
518    #[must_use]
519    pub fn trigram_similarity(entry_a: &NormalizedEntry, entry_b: &NormalizedEntry) -> f64 {
520        if entry_a.trigrams.is_empty() || entry_b.trigrams.is_empty() {
521            return 0.0;
522        }
523
524        let set_a: HashSet<_> = entry_a.trigrams.iter().collect();
525        let set_b: HashSet<_> = entry_b.trigrams.iter().collect();
526
527        let intersection = set_a.intersection(&set_b).count();
528        let union = set_a.union(&set_b).count();
529
530        if union == 0 {
531            0.0
532        } else {
533            intersection as f64 / union as f64
534        }
535    }
536}
537
538/// Statistics about the component index.
539#[derive(Debug, Clone)]
540pub struct IndexStats {
541    /// Total number of indexed components
542    pub total_components: usize,
543    /// Number of unique ecosystems
544    pub ecosystems: usize,
545    /// Number of unique prefixes
546    pub prefixes: usize,
547    /// Number of unique trigrams
548    pub trigrams: usize,
549    /// Average components per ecosystem
550    pub avg_per_ecosystem: usize,
551    /// Average components per prefix
552    pub avg_per_prefix: usize,
553    /// Average components per trigram
554    pub avg_per_trigram: usize,
555}
556
557/// Batch candidate generator that combines multiple indexing strategies.
558///
559/// For best recall, combines:
560/// 1. `ComponentIndex` (ecosystem, prefix, trigram-based)
561/// 2. LSH index (for large SBOMs, catches approximate matches)
562/// 3. Cross-ecosystem mappings (optional)
563///
564/// The candidates from each source are deduplicated and merged.
565pub struct BatchCandidateGenerator {
566    /// Primary component index
567    component_index: ComponentIndex,
568    /// Optional LSH index for large SBOMs
569    lsh_index: Option<super::lsh::LshIndex>,
570    /// Optional cross-ecosystem database
571    cross_ecosystem_db: Option<super::cross_ecosystem::CrossEcosystemDb>,
572    /// Configuration
573    config: BatchCandidateConfig,
574}
575
576/// Configuration for batch candidate generation.
577#[derive(Debug, Clone)]
578pub struct BatchCandidateConfig {
579    /// Maximum candidates per source component
580    pub max_candidates: usize,
581    /// Maximum name length difference
582    pub max_length_diff: usize,
583    /// Minimum SBOM size to enable LSH (smaller SBOMs don't benefit)
584    pub lsh_threshold: usize,
585    /// Enable cross-ecosystem matching
586    pub enable_cross_ecosystem: bool,
587}
588
589impl Default for BatchCandidateConfig {
590    fn default() -> Self {
591        Self {
592            max_candidates: 100,
593            max_length_diff: 5,
594            lsh_threshold: 500, // Only use LSH for SBOMs with 500+ components
595            enable_cross_ecosystem: true,
596        }
597    }
598}
599
600/// Result of batch candidate generation.
601#[derive(Debug)]
602pub struct BatchCandidateResult {
603    /// Source component ID
604    pub source_id: CanonicalId,
605    /// Candidates from component index
606    pub index_candidates: Vec<CanonicalId>,
607    /// Additional candidates from LSH (not in `index_candidates`)
608    pub lsh_candidates: Vec<CanonicalId>,
609    /// Cross-ecosystem candidates (if different ecosystems)
610    pub cross_ecosystem_candidates: Vec<CanonicalId>,
611    /// Total unique candidates
612    pub total_unique: usize,
613}
614
615impl BatchCandidateGenerator {
616    /// Create a new batch candidate generator from an SBOM.
617    #[must_use]
618    pub fn build(sbom: &NormalizedSbom, config: BatchCandidateConfig) -> Self {
619        let component_index = ComponentIndex::build(sbom);
620
621        // Only build LSH index for large SBOMs
622        let lsh_index = if sbom.component_count() >= config.lsh_threshold {
623            Some(super::lsh::LshIndex::build(
624                sbom,
625                super::lsh::LshConfig::default(),
626            ))
627        } else {
628            None
629        };
630
631        // Optionally load cross-ecosystem database
632        let cross_ecosystem_db = if config.enable_cross_ecosystem {
633            Some(super::cross_ecosystem::CrossEcosystemDb::with_builtin_mappings())
634        } else {
635            None
636        };
637
638        Self {
639            component_index,
640            lsh_index,
641            cross_ecosystem_db,
642            config,
643        }
644    }
645
646    /// Generate candidates for a single component.
647    pub fn find_candidates(
648        &self,
649        source_id: &CanonicalId,
650        source_component: &Component,
651    ) -> BatchCandidateResult {
652        let mut seen: HashSet<CanonicalId> = HashSet::new();
653
654        // Get normalized entry for the source
655        let source_entry = self.component_index.get_entry(source_id).map_or_else(
656            || {
657                // Build entry on the fly if not in our index (source from different SBOM)
658                ComponentIndex::normalize_component(source_component)
659            },
660            NormalizedEntry::clone,
661        );
662
663        // 1. Component index candidates
664        let index_candidates = self.component_index.find_candidates(
665            source_id,
666            &source_entry,
667            self.config.max_candidates,
668            self.config.max_length_diff,
669        );
670        for id in &index_candidates {
671            seen.insert(id.clone());
672        }
673
674        // 2. LSH candidates (additional ones not found by component index)
675        let lsh_candidates: Vec<CanonicalId> =
676            self.lsh_index.as_ref().map_or_else(Vec::new, |lsh| {
677                let candidates: Vec<_> = lsh
678                    .find_candidates(source_component)
679                    .into_iter()
680                    .filter(|id| id != source_id && !seen.contains(id))
681                    .take(self.config.max_candidates / 2) // Limit LSH additions
682                    .collect();
683                for id in &candidates {
684                    seen.insert(id.clone());
685                }
686                candidates
687            });
688
689        // 3. Cross-ecosystem candidates
690        let cross_ecosystem_candidates: Vec<CanonicalId> = if let (Some(db), Some(eco)) =
691            (&self.cross_ecosystem_db, &source_component.ecosystem)
692        {
693            let candidates: Vec<_> = db
694                .find_equivalents(eco, &source_component.name)
695                .into_iter()
696                .flat_map(|m| {
697                    // Look up components with these names in our index
698                    let target_eco_str = m.target_ecosystem.to_string().to_lowercase();
699                    self.component_index
700                        .get_by_ecosystem(&target_eco_str)
701                        .unwrap_or_default()
702                })
703                .filter(|id| id != source_id && !seen.contains(id))
704                .take(self.config.max_candidates / 4) // Limit cross-ecosystem
705                .collect();
706            for id in &candidates {
707                seen.insert(id.clone());
708            }
709            candidates
710        } else {
711            Vec::new()
712        };
713
714        let total_unique = seen.len();
715
716        BatchCandidateResult {
717            source_id: source_id.clone(),
718            index_candidates,
719            lsh_candidates,
720            cross_ecosystem_candidates,
721            total_unique,
722        }
723    }
724
725    /// Generate candidates for multiple components in parallel.
726    #[must_use]
727    pub fn find_candidates_batch(
728        &self,
729        sources: &[(&CanonicalId, &Component)],
730    ) -> Vec<BatchCandidateResult> {
731        sources
732            .par_iter()
733            .map(|(id, comp)| self.find_candidates(id, comp))
734            .collect()
735    }
736
737    /// Get all unique candidates (deduplicated across all strategies).
738    #[must_use]
739    pub fn all_candidates(
740        &self,
741        source_id: &CanonicalId,
742        source_component: &Component,
743    ) -> Vec<CanonicalId> {
744        let result = self.find_candidates(source_id, source_component);
745        let mut all: Vec<_> = result.index_candidates;
746        all.extend(result.lsh_candidates);
747        all.extend(result.cross_ecosystem_candidates);
748        all
749    }
750
751    /// Get the underlying component index.
752    #[must_use]
753    pub const fn component_index(&self) -> &ComponentIndex {
754        &self.component_index
755    }
756
757    /// Check if LSH is enabled.
758    #[must_use]
759    pub const fn has_lsh(&self) -> bool {
760        self.lsh_index.is_some()
761    }
762
763    /// Check if cross-ecosystem matching is enabled.
764    #[must_use]
765    pub const fn has_cross_ecosystem(&self) -> bool {
766        self.cross_ecosystem_db.is_some()
767    }
768
769    /// Get statistics about the generator.
770    pub fn stats(&self) -> BatchCandidateStats {
771        BatchCandidateStats {
772            index_stats: self.component_index.stats(),
773            lsh_enabled: self.lsh_index.is_some(),
774            lsh_stats: self.lsh_index.as_ref().map(super::lsh::LshIndex::stats),
775            cross_ecosystem_enabled: self.cross_ecosystem_db.is_some(),
776        }
777    }
778}
779
780/// Statistics about the batch candidate generator.
781#[derive(Debug)]
782pub struct BatchCandidateStats {
783    /// Component index statistics
784    pub index_stats: IndexStats,
785    /// Whether LSH is enabled
786    pub lsh_enabled: bool,
787    /// LSH statistics (if enabled)
788    pub lsh_stats: Option<super::lsh::LshIndexStats>,
789    /// Whether cross-ecosystem matching is enabled
790    pub cross_ecosystem_enabled: bool,
791}
792
793/// A lazily-built component index that only constructs the index on first use.
794///
795/// This is useful when the index might not be needed (e.g., when doing simple
796/// exact-match only comparisons), or when construction should be deferred.
797pub struct LazyComponentIndex {
798    /// The SBOM to index (stored for deferred building)
799    sbom: Option<std::sync::Arc<NormalizedSbom>>,
800    /// The built index (populated on first access)
801    index: std::sync::OnceLock<ComponentIndex>,
802}
803
804impl LazyComponentIndex {
805    /// Create a new lazy index that will build from the given SBOM on first access.
806    #[must_use]
807    pub const fn new(sbom: std::sync::Arc<NormalizedSbom>) -> Self {
808        Self {
809            sbom: Some(sbom),
810            index: std::sync::OnceLock::new(),
811        }
812    }
813
814    /// Create a lazy index from an already-built `ComponentIndex`.
815    #[must_use]
816    pub fn from_index(index: ComponentIndex) -> Self {
817        let lazy = Self {
818            sbom: None,
819            index: std::sync::OnceLock::new(),
820        };
821        let _ = lazy.index.set(index);
822        lazy
823    }
824
825    /// Get the index, building it if necessary.
826    ///
827    /// This is safe to call from multiple threads - the index will only
828    /// be built once.
829    pub fn get(&self) -> &ComponentIndex {
830        self.index.get_or_init(|| {
831            self.sbom.as_ref().map_or_else(
832                || {
833                    // Empty index as fallback (shouldn't happen in normal use)
834                    ComponentIndex::build(&NormalizedSbom::default())
835                },
836                |sbom| ComponentIndex::build(sbom),
837            )
838        })
839    }
840
841    /// Check if the index has been built yet.
842    pub fn is_built(&self) -> bool {
843        self.index.get().is_some()
844    }
845
846    /// Get the index if already built, without triggering a build.
847    pub fn try_get(&self) -> Option<&ComponentIndex> {
848        self.index.get()
849    }
850}
851
852impl std::ops::Deref for LazyComponentIndex {
853    type Target = ComponentIndex;
854
855    fn deref(&self) -> &Self::Target {
856        self.get()
857    }
858}
859
860#[cfg(test)]
861mod tests {
862    use super::*;
863    use crate::model::{DocumentMetadata, Ecosystem};
864
865    fn make_component(name: &str, purl: Option<&str>) -> Component {
866        let mut comp = Component::new(name.to_string(), format!("test-{}", name));
867        comp.version = Some("1.0.0".to_string());
868        comp.identifiers.purl = purl.map(|s| s.to_string());
869        // Convert extracted ecosystem string to Ecosystem enum
870        comp.ecosystem = purl
871            .and_then(ComponentIndex::extract_ecosystem)
872            .map(|eco_str| Ecosystem::from_purl_type(&eco_str));
873        comp
874    }
875
876    #[test]
877    fn test_extract_ecosystem() {
878        assert_eq!(
879            ComponentIndex::extract_ecosystem("pkg:pypi/requests@2.28.0"),
880            Some("pypi".to_string())
881        );
882        assert_eq!(
883            ComponentIndex::extract_ecosystem("pkg:npm/@angular/core@14.0.0"),
884            Some("npm".to_string())
885        );
886        assert_eq!(
887            ComponentIndex::extract_ecosystem("pkg:cargo/serde@1.0.0"),
888            Some("cargo".to_string())
889        );
890    }
891
892    #[test]
893    fn test_normalize_name_pypi() {
894        assert_eq!(
895            ComponentIndex::normalize_name("Python_Dateutil", Some("pypi")),
896            "python-dateutil"
897        );
898        assert_eq!(
899            ComponentIndex::normalize_name("Some.Package", Some("pypi")),
900            "some-package"
901        );
902    }
903
904    #[test]
905    fn test_normalize_name_cargo() {
906        assert_eq!(
907            ComponentIndex::normalize_name("serde-json", Some("cargo")),
908            "serde_json"
909        );
910    }
911
912    #[test]
913    fn test_build_index() {
914        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
915
916        let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
917        let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
918        let comp3 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
919
920        sbom.add_component(comp1);
921        sbom.add_component(comp2);
922        sbom.add_component(comp3);
923
924        let index = ComponentIndex::build(&sbom);
925
926        assert_eq!(index.len(), 3);
927        assert_eq!(index.by_ecosystem.get("pypi").map(|v| v.len()), Some(2));
928        assert_eq!(index.by_ecosystem.get("cargo").map(|v| v.len()), Some(1));
929    }
930
931    #[test]
932    fn test_find_candidates_same_ecosystem() {
933        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
934
935        let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
936        let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
937        let comp3 = make_component("flask", Some("pkg:pypi/flask@2.0.0"));
938        let comp4 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
939
940        sbom.add_component(comp1.clone());
941        sbom.add_component(comp2);
942        sbom.add_component(comp3);
943        sbom.add_component(comp4);
944
945        let index = ComponentIndex::build(&sbom);
946
947        // Get the ID for requests
948        let requests_id = sbom
949            .components
950            .keys()
951            .find(|id| {
952                sbom.components
953                    .get(*id)
954                    .map(|c| c.name == "requests")
955                    .unwrap_or(false)
956            })
957            .unwrap();
958
959        let entry = index.get_entry(requests_id).unwrap();
960        let candidates = index.find_candidates(requests_id, entry, 10, 5);
961
962        // Should find pypi packages, not cargo packages
963        assert!(candidates.len() >= 2);
964        for cand_id in &candidates {
965            let cand_entry = index.get_entry(cand_id).unwrap();
966            assert_eq!(cand_entry.ecosystem, Some("pypi".to_string()));
967        }
968    }
969
970    #[test]
971    fn test_compute_trigrams() {
972        // Normal case
973        let trigrams = ComponentIndex::compute_trigrams("lodash");
974        assert_eq!(trigrams, vec!["lod", "oda", "das", "ash"]);
975
976        // Short name (< 3 chars)
977        let trigrams = ComponentIndex::compute_trigrams("ab");
978        assert_eq!(trigrams, vec!["ab"]);
979
980        // Empty name
981        let trigrams = ComponentIndex::compute_trigrams("");
982        assert!(trigrams.is_empty());
983
984        // Exactly 3 chars
985        let trigrams = ComponentIndex::compute_trigrams("abc");
986        assert_eq!(trigrams, vec!["abc"]);
987    }
988
989    #[test]
990    fn test_trigram_similarity() {
991        let entry_a = NormalizedEntry {
992            normalized_purl: None,
993            normalized_name: "lodash".to_string(),
994            name_length: 6,
995            ecosystem: None,
996            prefix: "lod".to_string(),
997            trigrams: vec![
998                "lod".to_string(),
999                "oda".to_string(),
1000                "das".to_string(),
1001                "ash".to_string(),
1002            ],
1003        };
1004
1005        let entry_b = NormalizedEntry {
1006            normalized_purl: None,
1007            normalized_name: "lodash-es".to_string(),
1008            name_length: 9,
1009            ecosystem: None,
1010            prefix: "lod".to_string(),
1011            trigrams: vec![
1012                "lod".to_string(),
1013                "oda".to_string(),
1014                "das".to_string(),
1015                "ash".to_string(),
1016                "sh-".to_string(),
1017                "h-e".to_string(),
1018                "-es".to_string(),
1019            ],
1020        };
1021
1022        let similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_b);
1023        // lodash has 4 trigrams, lodash-es has 7, they share 4
1024        // Jaccard = 4 / 7 ≈ 0.57
1025        assert!(
1026            similarity > 0.5 && similarity < 0.6,
1027            "Expected ~0.57, got {}",
1028            similarity
1029        );
1030
1031        // Identical entries should have similarity 1.0
1032        let same_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_a);
1033        assert!((same_similarity - 1.0).abs() < f64::EPSILON);
1034
1035        // Completely different entries should have low similarity
1036        let entry_c = NormalizedEntry {
1037            normalized_purl: None,
1038            normalized_name: "react".to_string(),
1039            name_length: 5,
1040            ecosystem: None,
1041            prefix: "rea".to_string(),
1042            trigrams: vec!["rea".to_string(), "eac".to_string(), "act".to_string()],
1043        };
1044
1045        let diff_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_c);
1046        assert!(
1047            diff_similarity < 0.1,
1048            "Expected low similarity, got {}",
1049            diff_similarity
1050        );
1051    }
1052
1053    #[test]
1054    fn test_trigram_index_find_similar_suffix() {
1055        // Test that trigram indexing can find packages with different prefixes but similar content
1056        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
1057
1058        // These packages share trigrams in the middle/end
1059        let comp1 = make_component("react-dom", Some("pkg:npm/react-dom@18.0.0"));
1060        let comp2 = make_component("preact-dom", Some("pkg:npm/preact-dom@10.0.0")); // shares "act", "-do", "dom"
1061        let comp3 = make_component("angular", Some("pkg:npm/angular@15.0.0")); // completely different
1062
1063        sbom.add_component(comp1.clone());
1064        sbom.add_component(comp2);
1065        sbom.add_component(comp3);
1066
1067        let index = ComponentIndex::build(&sbom);
1068
1069        // Find ID for react-dom
1070        let react_id = sbom
1071            .components
1072            .keys()
1073            .find(|id| {
1074                sbom.components
1075                    .get(*id)
1076                    .map(|c| c.name == "react-dom")
1077                    .unwrap_or(false)
1078            })
1079            .unwrap();
1080
1081        let entry = index.get_entry(react_id).unwrap();
1082
1083        // Should find preact-dom via trigram matching even though prefix differs
1084        let candidates = index.find_candidates(react_id, entry, 10, 5);
1085
1086        let preact_found = candidates.iter().any(|id| {
1087            index
1088                .get_entry(id)
1089                .map(|e| e.normalized_name.contains("preact"))
1090                .unwrap_or(false)
1091        });
1092
1093        assert!(preact_found, "Should find preact-dom via trigram matching");
1094    }
1095}
sbom_tools/matching/index.rs

sbom_tools/matching/
index.rs