1use crate::model::{CanonicalId, Component, NormalizedSbom};
7use rayon::prelude::*;
8use std::collections::{HashMap, HashSet};
9use std::sync::Arc;
10
11#[derive(Debug, Clone)]
13pub struct NormalizedEntry {
14 pub normalized_purl: Option<String>,
16 pub normalized_name: String,
18 pub name_length: usize,
20 pub ecosystem: Option<String>,
22 pub prefix: String,
24 pub trigrams: Vec<String>,
26}
27
28pub struct ComponentIndex {
38 by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>>,
40 by_prefix: HashMap<String, Vec<Arc<CanonicalId>>>,
42 by_trigram: HashMap<String, Vec<Arc<CanonicalId>>>,
44 entries: HashMap<Arc<CanonicalId>, NormalizedEntry>,
46 all_ids: Vec<Arc<CanonicalId>>,
48}
49
50impl ComponentIndex {
51 #[must_use]
56 pub fn build(sbom: &NormalizedSbom) -> Self {
57 let mut by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
58 let mut by_prefix: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
59 let mut by_trigram: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
60 let mut entries: HashMap<Arc<CanonicalId>, NormalizedEntry> = HashMap::new();
61 let mut all_ids: Vec<Arc<CanonicalId>> = Vec::new();
62
63 for (id, comp) in &sbom.components {
64 let entry = Self::normalize_component(comp);
65 let arc_id = Arc::new(id.clone());
67
68 if let Some(ref eco) = entry.ecosystem {
70 by_ecosystem
71 .entry(eco.clone())
72 .or_default()
73 .push(Arc::clone(&arc_id));
74 }
75
76 if !entry.prefix.is_empty() {
78 by_prefix
79 .entry(entry.prefix.clone())
80 .or_default()
81 .push(Arc::clone(&arc_id));
82 }
83
84 for trigram in &entry.trigrams {
86 by_trigram
87 .entry(trigram.clone())
88 .or_default()
89 .push(Arc::clone(&arc_id));
90 }
91
92 entries.insert(Arc::clone(&arc_id), entry);
93 all_ids.push(arc_id);
94 }
95
96 Self {
97 by_ecosystem,
98 by_prefix,
99 by_trigram,
100 entries,
101 all_ids,
102 }
103 }
104
105 #[must_use]
107 pub fn normalize_component(comp: &Component) -> NormalizedEntry {
108 let (ecosystem, normalized_purl) = comp.identifiers.purl.as_ref().map_or_else(
110 || {
111 (
114 comp.ecosystem
115 .as_ref()
116 .map(std::string::ToString::to_string),
117 None,
118 )
119 },
120 |purl| {
121 let eco = Self::extract_ecosystem(purl);
122 let normalized = Self::normalize_purl(purl);
123 (eco, Some(normalized))
124 },
125 );
126
127 let normalized_name = Self::normalize_name(&comp.name, ecosystem.as_deref());
129 let name_length = normalized_name.len();
130 let prefix = normalized_name.chars().take(3).collect::<String>();
131 let trigrams = Self::compute_trigrams(&normalized_name);
132
133 NormalizedEntry {
134 normalized_purl,
135 normalized_name,
136 name_length,
137 ecosystem,
138 prefix,
139 trigrams,
140 }
141 }
142
143 fn compute_trigrams(name: &str) -> Vec<String> {
148 if name.len() < 3 {
149 return if name.is_empty() {
151 vec![]
152 } else {
153 vec![name.to_string()]
154 };
155 }
156
157 if name.is_ascii() {
160 return name
161 .as_bytes()
162 .windows(3)
163 .map(|w| {
164 unsafe { std::str::from_utf8_unchecked(w) }.to_string()
167 })
168 .collect();
169 }
170
171 let chars: Vec<char> = name.chars().collect();
173 if chars.len() < 3 {
174 return vec![name.to_string()];
175 }
176
177 chars
178 .windows(3)
179 .map(|w| w.iter().collect::<String>())
180 .collect()
181 }
182
183 fn extract_ecosystem(purl: &str) -> Option<String> {
185 if let Some(rest) = purl.strip_prefix("pkg:")
187 && let Some(slash_pos) = rest.find('/')
188 {
189 return Some(rest[..slash_pos].to_lowercase());
190 }
191 None
192 }
193
194 fn normalize_purl(purl: &str) -> String {
196 let purl_lower = purl.to_lowercase();
198 if let Some(at_pos) = purl_lower.rfind('@') {
200 purl_lower[..at_pos].to_string()
201 } else {
202 purl_lower
203 }
204 }
205
206 #[must_use]
216 pub fn normalize_name(name: &str, ecosystem: Option<&str>) -> String {
217 let mut normalized = name.to_lowercase();
218
219 match ecosystem {
221 Some("pypi") => {
222 normalized = normalized.replace(['_', '.'], "-");
224 }
225 Some("cargo") => {
226 normalized = normalized.replace('-', "_");
228 }
229 Some("npm") => {
230 }
233 _ => {
234 normalized = normalized.replace('_', "-");
236 }
237 }
238
239 while normalized.contains("--") {
241 normalized = normalized.replace("--", "-");
242 }
243
244 normalized
245 }
246
247 #[must_use]
249 pub fn get_entry(&self, id: &CanonicalId) -> Option<&NormalizedEntry> {
250 self.entries.get(id)
252 }
253
254 #[must_use]
259 pub fn get_by_ecosystem(&self, ecosystem: &str) -> Option<Vec<CanonicalId>> {
260 self.by_ecosystem
261 .get(ecosystem)
262 .map(|v| v.iter().map(|arc| (**arc).clone()).collect())
263 }
264
265 #[must_use]
273 pub fn find_candidates(
274 &self,
275 source_id: &CanonicalId,
276 source_entry: &NormalizedEntry,
277 max_candidates: usize,
278 max_length_diff: usize,
279 ) -> Vec<CanonicalId> {
280 let mut candidates: Vec<Arc<CanonicalId>> = Vec::new();
281 let mut seen: HashSet<Arc<CanonicalId>> = HashSet::new();
282
283 let source_trigrams: HashSet<&str> =
285 source_entry.trigrams.iter().map(String::as_str).collect();
286
287 if let Some(ref eco) = source_entry.ecosystem
294 && let Some(ids) = self.by_ecosystem.get(eco)
295 {
296 let mut ranked: Vec<(usize, &Arc<CanonicalId>)> = Vec::new();
297 for id in ids {
298 if id.as_ref() != source_id
299 && !seen.contains(id)
300 && let Some(entry) = self.entries.get(id.as_ref())
301 {
302 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
303 .unsigned_abs() as usize;
304 if len_diff <= max_length_diff {
305 let overlap = entry
306 .trigrams
307 .iter()
308 .filter(|t| source_trigrams.contains(t.as_str()))
309 .count();
310 ranked.push((overlap, id));
311 }
312 }
313 }
314 ranked.sort_by(|a, b| b.0.cmp(&a.0).then_with(|| a.1.value().cmp(b.1.value())));
316 for (_, id) in ranked {
317 candidates.push(Arc::clone(id));
318 seen.insert(Arc::clone(id));
319 }
320 }
321
322 if candidates.len() < max_candidates
324 && !source_entry.prefix.is_empty()
325 && let Some(ids) = self.by_prefix.get(&source_entry.prefix)
326 {
327 for id in ids {
328 if id.as_ref() != source_id
329 && !seen.contains(id)
330 && let Some(entry) = self.entries.get(id.as_ref())
331 {
332 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
333 .unsigned_abs() as usize;
334 if len_diff <= max_length_diff {
335 candidates.push(Arc::clone(id));
336 seen.insert(Arc::clone(id));
337 }
338 }
339 if candidates.len() >= max_candidates {
340 break;
341 }
342 }
343 }
344
345 if candidates.len() < max_candidates && source_entry.prefix.len() >= 2 {
348 let prefix_2 = &source_entry.prefix[..2.min(source_entry.prefix.len())];
349 let mut similar_prefixes: Vec<_> = self
350 .by_prefix
351 .iter()
352 .filter(|(prefix, _)| {
353 prefix.starts_with(prefix_2) && *prefix != &source_entry.prefix
354 })
355 .collect();
356 similar_prefixes.sort_by(|a, b| a.0.cmp(b.0));
357
358 for (_prefix, ids) in similar_prefixes {
359 for id in ids {
360 if id.as_ref() != source_id
361 && !seen.contains(id)
362 && let Some(entry) = self.entries.get(id.as_ref())
363 {
364 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
365 .unsigned_abs() as usize;
366 if len_diff <= max_length_diff {
367 candidates.push(Arc::clone(id));
368 seen.insert(Arc::clone(id));
369 }
370 }
371 if candidates.len() >= max_candidates {
372 break;
373 }
374 }
375 if candidates.len() >= max_candidates {
376 break;
377 }
378 }
379 }
380
381 if candidates.len() < max_candidates && !source_entry.trigrams.is_empty() {
384 let mut trigram_scores: HashMap<Arc<CanonicalId>, usize> = HashMap::new();
386
387 for trigram in &source_entry.trigrams {
388 if let Some(ids) = self.by_trigram.get(trigram) {
389 for id in ids {
390 if id.as_ref() != source_id && !seen.contains(id) {
391 *trigram_scores.entry(Arc::clone(id)).or_default() += 1;
392 }
393 }
394 }
395 }
396
397 let min_shared = if source_entry.trigrams.len() <= 2 {
399 1
400 } else {
401 2
402 };
403
404 let mut scored: Vec<_> = trigram_scores
407 .into_iter()
408 .filter(|(_, count)| *count >= min_shared)
409 .collect();
410 scored.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.value().cmp(b.0.value())));
411
412 for (id, _score) in scored {
413 if candidates.len() >= max_candidates {
414 break;
415 }
416 if let Some(entry) = self.entries.get(id.as_ref()) {
417 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
418 .unsigned_abs() as usize;
419 if len_diff <= max_length_diff {
420 candidates.push(Arc::clone(&id));
421 seen.insert(id);
422 }
423 }
424 }
425 }
426
427 candidates.truncate(max_candidates);
429 candidates.into_iter().map(|arc| (*arc).clone()).collect()
430 }
431
432 #[must_use]
436 pub fn all_ids(&self) -> Vec<CanonicalId> {
437 self.all_ids.iter().map(|arc| (**arc).clone()).collect()
438 }
439
440 #[must_use]
442 pub fn len(&self) -> usize {
443 self.entries.len()
444 }
445
446 #[must_use]
448 pub fn is_empty(&self) -> bool {
449 self.entries.is_empty()
450 }
451
452 #[must_use]
459 pub fn find_candidates_parallel<'a>(
460 &self,
461 sources: &[(&'a CanonicalId, &NormalizedEntry)],
462 max_candidates: usize,
463 max_length_diff: usize,
464 ) -> Vec<(&'a CanonicalId, Vec<CanonicalId>)> {
465 sources
466 .par_iter()
467 .map(|(source_id, source_entry)| {
468 let candidates =
469 self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
470 (*source_id, candidates)
471 })
472 .collect()
473 }
474
475 #[must_use]
480 pub fn find_all_candidates_from(
481 &self,
482 other: &Self,
483 max_candidates: usize,
484 max_length_diff: usize,
485 ) -> Vec<(CanonicalId, Vec<CanonicalId>)> {
486 let sources: Vec<_> = other.entries.iter().collect();
487
488 sources
489 .par_iter()
490 .map(|(source_id, source_entry)| {
491 let candidates =
492 self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
493 ((*source_id).as_ref().clone(), candidates)
495 })
496 .collect::<Vec<_>>()
497 }
498
499 pub fn stats(&self) -> IndexStats {
501 let ecosystems = self.by_ecosystem.len();
502 let prefixes = self.by_prefix.len();
503 let trigrams = self.by_trigram.len();
504 let avg_per_ecosystem = if ecosystems > 0 {
505 self.by_ecosystem
506 .values()
507 .map(std::vec::Vec::len)
508 .sum::<usize>()
509 / ecosystems
510 } else {
511 0
512 };
513 let avg_per_prefix = if prefixes > 0 {
514 self.by_prefix
515 .values()
516 .map(std::vec::Vec::len)
517 .sum::<usize>()
518 / prefixes
519 } else {
520 0
521 };
522 let avg_per_trigram = if trigrams > 0 {
523 self.by_trigram
524 .values()
525 .map(std::vec::Vec::len)
526 .sum::<usize>()
527 / trigrams
528 } else {
529 0
530 };
531
532 IndexStats {
533 total_components: self.entries.len(),
534 ecosystems,
535 prefixes,
536 trigrams,
537 avg_per_ecosystem,
538 avg_per_prefix,
539 avg_per_trigram,
540 }
541 }
542
543 #[must_use]
547 pub fn trigram_similarity(entry_a: &NormalizedEntry, entry_b: &NormalizedEntry) -> f64 {
548 if entry_a.trigrams.is_empty() || entry_b.trigrams.is_empty() {
549 return 0.0;
550 }
551
552 let set_a: HashSet<_> = entry_a.trigrams.iter().collect();
553 let set_b: HashSet<_> = entry_b.trigrams.iter().collect();
554
555 let intersection = set_a.intersection(&set_b).count();
556 let union = set_a.union(&set_b).count();
557
558 if union == 0 {
559 0.0
560 } else {
561 intersection as f64 / union as f64
562 }
563 }
564}
565
566#[derive(Debug, Clone)]
568pub struct IndexStats {
569 pub total_components: usize,
571 pub ecosystems: usize,
573 pub prefixes: usize,
575 pub trigrams: usize,
577 pub avg_per_ecosystem: usize,
579 pub avg_per_prefix: usize,
581 pub avg_per_trigram: usize,
583}
584
585pub struct BatchCandidateGenerator {
594 component_index: ComponentIndex,
596 lsh_index: Option<super::lsh::LshIndex>,
598 cross_ecosystem_db: Option<super::cross_ecosystem::CrossEcosystemDb>,
600 config: BatchCandidateConfig,
602}
603
604#[derive(Debug, Clone)]
606pub struct BatchCandidateConfig {
607 pub max_candidates: usize,
609 pub max_length_diff: usize,
611 pub lsh_threshold: usize,
613 pub enable_cross_ecosystem: bool,
615}
616
617impl Default for BatchCandidateConfig {
618 fn default() -> Self {
619 Self {
620 max_candidates: 100,
621 max_length_diff: 5,
622 lsh_threshold: 500, enable_cross_ecosystem: true,
624 }
625 }
626}
627
628#[derive(Debug)]
630pub struct BatchCandidateResult {
631 pub source_id: CanonicalId,
633 pub index_candidates: Vec<CanonicalId>,
635 pub lsh_candidates: Vec<CanonicalId>,
637 pub cross_ecosystem_candidates: Vec<CanonicalId>,
639 pub total_unique: usize,
641}
642
643impl BatchCandidateGenerator {
644 #[must_use]
646 pub fn build(sbom: &NormalizedSbom, config: BatchCandidateConfig) -> Self {
647 let component_index = ComponentIndex::build(sbom);
648
649 let lsh_index = if sbom.component_count() >= config.lsh_threshold {
651 Some(super::lsh::LshIndex::build(
652 sbom,
653 super::lsh::LshConfig::default(),
654 ))
655 } else {
656 None
657 };
658
659 let cross_ecosystem_db = if config.enable_cross_ecosystem {
661 Some(super::cross_ecosystem::CrossEcosystemDb::with_builtin_mappings())
662 } else {
663 None
664 };
665
666 Self {
667 component_index,
668 lsh_index,
669 cross_ecosystem_db,
670 config,
671 }
672 }
673
674 pub fn find_candidates(
676 &self,
677 source_id: &CanonicalId,
678 source_component: &Component,
679 ) -> BatchCandidateResult {
680 let mut seen: HashSet<CanonicalId> = HashSet::new();
681
682 let source_entry = self.component_index.get_entry(source_id).map_or_else(
684 || {
685 ComponentIndex::normalize_component(source_component)
687 },
688 NormalizedEntry::clone,
689 );
690
691 let index_candidates = self.component_index.find_candidates(
693 source_id,
694 &source_entry,
695 self.config.max_candidates,
696 self.config.max_length_diff,
697 );
698 for id in &index_candidates {
699 seen.insert(id.clone());
700 }
701
702 let lsh_candidates: Vec<CanonicalId> =
704 self.lsh_index.as_ref().map_or_else(Vec::new, |lsh| {
705 let candidates: Vec<_> = lsh
706 .find_candidates(source_component)
707 .into_iter()
708 .filter(|id| id != source_id && !seen.contains(id))
709 .take(self.config.max_candidates / 2) .collect();
711 for id in &candidates {
712 seen.insert(id.clone());
713 }
714 candidates
715 });
716
717 let cross_ecosystem_candidates: Vec<CanonicalId> = if let (Some(db), Some(eco)) =
719 (&self.cross_ecosystem_db, &source_component.ecosystem)
720 {
721 let candidates: Vec<_> = db
722 .find_equivalents(eco, &source_component.name)
723 .into_iter()
724 .flat_map(|m| {
725 let target_eco_str = m.target_ecosystem.to_string().to_lowercase();
727 self.component_index
728 .get_by_ecosystem(&target_eco_str)
729 .unwrap_or_default()
730 })
731 .filter(|id| id != source_id && !seen.contains(id))
732 .take(self.config.max_candidates / 4) .collect();
734 for id in &candidates {
735 seen.insert(id.clone());
736 }
737 candidates
738 } else {
739 Vec::new()
740 };
741
742 let total_unique = seen.len();
743
744 BatchCandidateResult {
745 source_id: source_id.clone(),
746 index_candidates,
747 lsh_candidates,
748 cross_ecosystem_candidates,
749 total_unique,
750 }
751 }
752
753 #[must_use]
755 pub fn find_candidates_batch(
756 &self,
757 sources: &[(&CanonicalId, &Component)],
758 ) -> Vec<BatchCandidateResult> {
759 sources
760 .par_iter()
761 .map(|(id, comp)| self.find_candidates(id, comp))
762 .collect()
763 }
764
765 #[must_use]
767 pub fn all_candidates(
768 &self,
769 source_id: &CanonicalId,
770 source_component: &Component,
771 ) -> Vec<CanonicalId> {
772 let result = self.find_candidates(source_id, source_component);
773 let mut all: Vec<_> = result.index_candidates;
774 all.extend(result.lsh_candidates);
775 all.extend(result.cross_ecosystem_candidates);
776 all
777 }
778
779 #[must_use]
781 pub const fn component_index(&self) -> &ComponentIndex {
782 &self.component_index
783 }
784
785 #[must_use]
787 pub const fn has_lsh(&self) -> bool {
788 self.lsh_index.is_some()
789 }
790
791 #[must_use]
793 pub const fn has_cross_ecosystem(&self) -> bool {
794 self.cross_ecosystem_db.is_some()
795 }
796
797 pub fn stats(&self) -> BatchCandidateStats {
799 BatchCandidateStats {
800 index_stats: self.component_index.stats(),
801 lsh_enabled: self.lsh_index.is_some(),
802 lsh_stats: self.lsh_index.as_ref().map(super::lsh::LshIndex::stats),
803 cross_ecosystem_enabled: self.cross_ecosystem_db.is_some(),
804 }
805 }
806}
807
808#[derive(Debug)]
810pub struct BatchCandidateStats {
811 pub index_stats: IndexStats,
813 pub lsh_enabled: bool,
815 pub lsh_stats: Option<super::lsh::LshIndexStats>,
817 pub cross_ecosystem_enabled: bool,
819}
820
821pub struct LazyComponentIndex {
826 sbom: Option<std::sync::Arc<NormalizedSbom>>,
828 index: std::sync::OnceLock<ComponentIndex>,
830}
831
832impl LazyComponentIndex {
833 #[must_use]
835 pub const fn new(sbom: std::sync::Arc<NormalizedSbom>) -> Self {
836 Self {
837 sbom: Some(sbom),
838 index: std::sync::OnceLock::new(),
839 }
840 }
841
842 #[must_use]
844 pub fn from_index(index: ComponentIndex) -> Self {
845 let lazy = Self {
846 sbom: None,
847 index: std::sync::OnceLock::new(),
848 };
849 let _ = lazy.index.set(index);
850 lazy
851 }
852
853 pub fn get(&self) -> &ComponentIndex {
858 self.index.get_or_init(|| {
859 self.sbom.as_ref().map_or_else(
860 || {
861 ComponentIndex::build(&NormalizedSbom::default())
863 },
864 |sbom| ComponentIndex::build(sbom),
865 )
866 })
867 }
868
869 pub fn is_built(&self) -> bool {
871 self.index.get().is_some()
872 }
873
874 pub fn try_get(&self) -> Option<&ComponentIndex> {
876 self.index.get()
877 }
878}
879
880impl std::ops::Deref for LazyComponentIndex {
881 type Target = ComponentIndex;
882
883 fn deref(&self) -> &Self::Target {
884 self.get()
885 }
886}
887
888#[cfg(test)]
889mod tests {
890 use super::*;
891 use crate::model::{DocumentMetadata, Ecosystem};
892
893 fn make_component(name: &str, purl: Option<&str>) -> Component {
894 let mut comp = Component::new(name.to_string(), format!("test-{}", name));
895 comp.version = Some("1.0.0".to_string());
896 comp.identifiers.purl = purl.map(|s| s.to_string());
897 comp.ecosystem = purl
899 .and_then(ComponentIndex::extract_ecosystem)
900 .map(|eco_str| Ecosystem::from_purl_type(&eco_str));
901 comp
902 }
903
904 #[test]
905 fn test_extract_ecosystem() {
906 assert_eq!(
907 ComponentIndex::extract_ecosystem("pkg:pypi/requests@2.28.0"),
908 Some("pypi".to_string())
909 );
910 assert_eq!(
911 ComponentIndex::extract_ecosystem("pkg:npm/@angular/core@14.0.0"),
912 Some("npm".to_string())
913 );
914 assert_eq!(
915 ComponentIndex::extract_ecosystem("pkg:cargo/serde@1.0.0"),
916 Some("cargo".to_string())
917 );
918 }
919
920 #[test]
921 fn test_normalize_name_pypi() {
922 assert_eq!(
923 ComponentIndex::normalize_name("Python_Dateutil", Some("pypi")),
924 "python-dateutil"
925 );
926 assert_eq!(
927 ComponentIndex::normalize_name("Some.Package", Some("pypi")),
928 "some-package"
929 );
930 }
931
932 #[test]
933 fn test_normalize_name_cargo() {
934 assert_eq!(
935 ComponentIndex::normalize_name("serde-json", Some("cargo")),
936 "serde_json"
937 );
938 }
939
940 #[test]
941 fn test_build_index() {
942 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
943
944 let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
945 let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
946 let comp3 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
947
948 sbom.add_component(comp1);
949 sbom.add_component(comp2);
950 sbom.add_component(comp3);
951
952 let index = ComponentIndex::build(&sbom);
953
954 assert_eq!(index.len(), 3);
955 assert_eq!(index.by_ecosystem.get("pypi").map(|v| v.len()), Some(2));
956 assert_eq!(index.by_ecosystem.get("cargo").map(|v| v.len()), Some(1));
957 }
958
959 #[test]
960 fn test_find_candidates_same_ecosystem() {
961 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
962
963 let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
964 let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
965 let comp3 = make_component("flask", Some("pkg:pypi/flask@2.0.0"));
966 let comp4 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
967
968 sbom.add_component(comp1.clone());
969 sbom.add_component(comp2);
970 sbom.add_component(comp3);
971 sbom.add_component(comp4);
972
973 let index = ComponentIndex::build(&sbom);
974
975 let requests_id = sbom
977 .components
978 .keys()
979 .find(|id| {
980 sbom.components
981 .get(*id)
982 .map(|c| c.name == "requests")
983 .unwrap_or(false)
984 })
985 .unwrap();
986
987 let entry = index.get_entry(requests_id).unwrap();
988 let candidates = index.find_candidates(requests_id, entry, 10, 5);
989
990 assert!(candidates.len() >= 2);
992 for cand_id in &candidates {
993 let cand_entry = index.get_entry(cand_id).unwrap();
994 assert_eq!(cand_entry.ecosystem, Some("pypi".to_string()));
995 }
996 }
997
998 #[test]
999 fn test_compute_trigrams() {
1000 let trigrams = ComponentIndex::compute_trigrams("lodash");
1002 assert_eq!(trigrams, vec!["lod", "oda", "das", "ash"]);
1003
1004 let trigrams = ComponentIndex::compute_trigrams("ab");
1006 assert_eq!(trigrams, vec!["ab"]);
1007
1008 let trigrams = ComponentIndex::compute_trigrams("");
1010 assert!(trigrams.is_empty());
1011
1012 let trigrams = ComponentIndex::compute_trigrams("abc");
1014 assert_eq!(trigrams, vec!["abc"]);
1015 }
1016
1017 #[test]
1018 fn test_trigram_similarity() {
1019 let entry_a = NormalizedEntry {
1020 normalized_purl: None,
1021 normalized_name: "lodash".to_string(),
1022 name_length: 6,
1023 ecosystem: None,
1024 prefix: "lod".to_string(),
1025 trigrams: vec![
1026 "lod".to_string(),
1027 "oda".to_string(),
1028 "das".to_string(),
1029 "ash".to_string(),
1030 ],
1031 };
1032
1033 let entry_b = NormalizedEntry {
1034 normalized_purl: None,
1035 normalized_name: "lodash-es".to_string(),
1036 name_length: 9,
1037 ecosystem: None,
1038 prefix: "lod".to_string(),
1039 trigrams: vec![
1040 "lod".to_string(),
1041 "oda".to_string(),
1042 "das".to_string(),
1043 "ash".to_string(),
1044 "sh-".to_string(),
1045 "h-e".to_string(),
1046 "-es".to_string(),
1047 ],
1048 };
1049
1050 let similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_b);
1051 assert!(
1054 similarity > 0.5 && similarity < 0.6,
1055 "Expected ~0.57, got {}",
1056 similarity
1057 );
1058
1059 let same_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_a);
1061 assert!((same_similarity - 1.0).abs() < f64::EPSILON);
1062
1063 let entry_c = NormalizedEntry {
1065 normalized_purl: None,
1066 normalized_name: "react".to_string(),
1067 name_length: 5,
1068 ecosystem: None,
1069 prefix: "rea".to_string(),
1070 trigrams: vec!["rea".to_string(), "eac".to_string(), "act".to_string()],
1071 };
1072
1073 let diff_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_c);
1074 assert!(
1075 diff_similarity < 0.1,
1076 "Expected low similarity, got {}",
1077 diff_similarity
1078 );
1079 }
1080
1081 #[test]
1082 fn test_trigram_index_find_similar_suffix() {
1083 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
1085
1086 let comp1 = make_component("react-dom", Some("pkg:npm/react-dom@18.0.0"));
1088 let comp2 = make_component("preact-dom", Some("pkg:npm/preact-dom@10.0.0")); let comp3 = make_component("angular", Some("pkg:npm/angular@15.0.0")); sbom.add_component(comp1.clone());
1092 sbom.add_component(comp2);
1093 sbom.add_component(comp3);
1094
1095 let index = ComponentIndex::build(&sbom);
1096
1097 let react_id = sbom
1099 .components
1100 .keys()
1101 .find(|id| {
1102 sbom.components
1103 .get(*id)
1104 .map(|c| c.name == "react-dom")
1105 .unwrap_or(false)
1106 })
1107 .unwrap();
1108
1109 let entry = index.get_entry(react_id).unwrap();
1110
1111 let candidates = index.find_candidates(react_id, entry, 10, 5);
1113
1114 let preact_found = candidates.iter().any(|id| {
1115 index
1116 .get_entry(id)
1117 .map(|e| e.normalized_name.contains("preact"))
1118 .unwrap_or(false)
1119 });
1120
1121 assert!(preact_found, "Should find preact-dom via trigram matching");
1122 }
1123}