1use crate::model::{CanonicalId, Component, NormalizedSbom};
7use rayon::prelude::*;
8use std::collections::{HashMap, HashSet};
9use std::sync::Arc;
10
11#[derive(Debug, Clone)]
13pub struct NormalizedEntry {
14 pub normalized_purl: Option<String>,
16 pub normalized_name: String,
18 pub name_length: usize,
20 pub ecosystem: Option<String>,
22 pub prefix: String,
24 pub trigrams: Vec<String>,
26}
27
28pub struct ComponentIndex {
38 by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>>,
40 by_prefix: HashMap<String, Vec<Arc<CanonicalId>>>,
42 by_trigram: HashMap<String, Vec<Arc<CanonicalId>>>,
44 entries: HashMap<Arc<CanonicalId>, NormalizedEntry>,
46 all_ids: Vec<Arc<CanonicalId>>,
48}
49
50impl ComponentIndex {
51 #[must_use]
56 pub fn build(sbom: &NormalizedSbom) -> Self {
57 let mut by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
58 let mut by_prefix: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
59 let mut by_trigram: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
60 let mut entries: HashMap<Arc<CanonicalId>, NormalizedEntry> = HashMap::new();
61 let mut all_ids: Vec<Arc<CanonicalId>> = Vec::new();
62
63 for (id, comp) in &sbom.components {
64 let entry = Self::normalize_component(comp);
65 let arc_id = Arc::new(id.clone());
67
68 if let Some(ref eco) = entry.ecosystem {
70 by_ecosystem
71 .entry(eco.clone())
72 .or_default()
73 .push(Arc::clone(&arc_id));
74 }
75
76 if !entry.prefix.is_empty() {
78 by_prefix
79 .entry(entry.prefix.clone())
80 .or_default()
81 .push(Arc::clone(&arc_id));
82 }
83
84 for trigram in &entry.trigrams {
86 by_trigram
87 .entry(trigram.clone())
88 .or_default()
89 .push(Arc::clone(&arc_id));
90 }
91
92 entries.insert(Arc::clone(&arc_id), entry);
93 all_ids.push(arc_id);
94 }
95
96 Self {
97 by_ecosystem,
98 by_prefix,
99 by_trigram,
100 entries,
101 all_ids,
102 }
103 }
104
105 #[must_use]
107 pub fn normalize_component(comp: &Component) -> NormalizedEntry {
108 let (ecosystem, normalized_purl) = comp.identifiers.purl.as_ref().map_or_else(
110 || {
111 (
114 comp.ecosystem
115 .as_ref()
116 .map(std::string::ToString::to_string),
117 None,
118 )
119 },
120 |purl| {
121 let eco = Self::extract_ecosystem(purl);
122 let normalized = Self::normalize_purl(purl);
123 (eco, Some(normalized))
124 },
125 );
126
127 let normalized_name = Self::normalize_name(&comp.name, ecosystem.as_deref());
129 let name_length = normalized_name.len();
130 let prefix = normalized_name.chars().take(3).collect::<String>();
131 let trigrams = Self::compute_trigrams(&normalized_name);
132
133 NormalizedEntry {
134 normalized_purl,
135 normalized_name,
136 name_length,
137 ecosystem,
138 prefix,
139 trigrams,
140 }
141 }
142
143 fn compute_trigrams(name: &str) -> Vec<String> {
148 if name.len() < 3 {
149 return if name.is_empty() {
151 vec![]
152 } else {
153 vec![name.to_string()]
154 };
155 }
156
157 if name.is_ascii() {
160 return name
161 .as_bytes()
162 .windows(3)
163 .map(|w| {
164 unsafe { std::str::from_utf8_unchecked(w) }.to_string()
167 })
168 .collect();
169 }
170
171 let chars: Vec<char> = name.chars().collect();
173 if chars.len() < 3 {
174 return vec![name.to_string()];
175 }
176
177 chars
178 .windows(3)
179 .map(|w| w.iter().collect::<String>())
180 .collect()
181 }
182
183 fn extract_ecosystem(purl: &str) -> Option<String> {
185 if let Some(rest) = purl.strip_prefix("pkg:")
187 && let Some(slash_pos) = rest.find('/')
188 {
189 return Some(rest[..slash_pos].to_lowercase());
190 }
191 None
192 }
193
194 fn normalize_purl(purl: &str) -> String {
196 let purl_lower = purl.to_lowercase();
198 if let Some(at_pos) = purl_lower.rfind('@') {
200 purl_lower[..at_pos].to_string()
201 } else {
202 purl_lower
203 }
204 }
205
206 #[must_use]
216 pub fn normalize_name(name: &str, ecosystem: Option<&str>) -> String {
217 let mut normalized = name.to_lowercase();
218
219 match ecosystem {
221 Some("pypi") => {
222 normalized = normalized.replace(['_', '.'], "-");
224 }
225 Some("cargo") => {
226 normalized = normalized.replace('-', "_");
228 }
229 Some("npm") => {
230 }
233 _ => {
234 normalized = normalized.replace('_', "-");
236 }
237 }
238
239 while normalized.contains("--") {
241 normalized = normalized.replace("--", "-");
242 }
243
244 normalized
245 }
246
247 #[must_use]
249 pub fn get_entry(&self, id: &CanonicalId) -> Option<&NormalizedEntry> {
250 self.entries.get(id)
252 }
253
254 #[must_use]
259 pub fn get_by_ecosystem(&self, ecosystem: &str) -> Option<Vec<CanonicalId>> {
260 self.by_ecosystem
261 .get(ecosystem)
262 .map(|v| v.iter().map(|arc| (**arc).clone()).collect())
263 }
264
265 #[must_use]
273 pub fn find_candidates(
274 &self,
275 source_id: &CanonicalId,
276 source_entry: &NormalizedEntry,
277 max_candidates: usize,
278 max_length_diff: usize,
279 ) -> Vec<CanonicalId> {
280 let mut candidates: Vec<Arc<CanonicalId>> = Vec::new();
281 let mut seen: HashSet<Arc<CanonicalId>> = HashSet::new();
282
283 if let Some(ref eco) = source_entry.ecosystem
285 && let Some(ids) = self.by_ecosystem.get(eco)
286 {
287 for id in ids {
288 if id.as_ref() != source_id && !seen.contains(id) {
289 if let Some(entry) = self.entries.get(id.as_ref()) {
291 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
292 .unsigned_abs() as usize;
293 if len_diff <= max_length_diff {
294 candidates.push(Arc::clone(id));
295 seen.insert(Arc::clone(id));
296 }
297 }
298 }
299 }
300 }
301
302 if candidates.len() < max_candidates
304 && !source_entry.prefix.is_empty()
305 && let Some(ids) = self.by_prefix.get(&source_entry.prefix)
306 {
307 for id in ids {
308 if id.as_ref() != source_id
309 && !seen.contains(id)
310 && let Some(entry) = self.entries.get(id.as_ref())
311 {
312 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
313 .unsigned_abs() as usize;
314 if len_diff <= max_length_diff {
315 candidates.push(Arc::clone(id));
316 seen.insert(Arc::clone(id));
317 }
318 }
319 if candidates.len() >= max_candidates {
320 break;
321 }
322 }
323 }
324
325 if candidates.len() < max_candidates && source_entry.prefix.len() >= 2 {
327 let prefix_2 = &source_entry.prefix[..2.min(source_entry.prefix.len())];
328 for (prefix, ids) in &self.by_prefix {
329 if prefix.starts_with(prefix_2) && prefix != &source_entry.prefix {
330 for id in ids {
331 if id.as_ref() != source_id
332 && !seen.contains(id)
333 && let Some(entry) = self.entries.get(id.as_ref())
334 {
335 let len_diff = (source_entry.name_length as i32
336 - entry.name_length as i32)
337 .unsigned_abs() as usize;
338 if len_diff <= max_length_diff {
339 candidates.push(Arc::clone(id));
340 seen.insert(Arc::clone(id));
341 }
342 }
343 if candidates.len() >= max_candidates {
344 break;
345 }
346 }
347 }
348 if candidates.len() >= max_candidates {
349 break;
350 }
351 }
352 }
353
354 if candidates.len() < max_candidates && !source_entry.trigrams.is_empty() {
357 let mut trigram_scores: HashMap<Arc<CanonicalId>, usize> = HashMap::new();
359
360 for trigram in &source_entry.trigrams {
361 if let Some(ids) = self.by_trigram.get(trigram) {
362 for id in ids {
363 if id.as_ref() != source_id && !seen.contains(id) {
364 *trigram_scores.entry(Arc::clone(id)).or_default() += 1;
365 }
366 }
367 }
368 }
369
370 let min_shared = if source_entry.trigrams.len() <= 2 {
372 1
373 } else {
374 2
375 };
376
377 let mut scored: Vec<_> = trigram_scores
379 .into_iter()
380 .filter(|(_, count)| *count >= min_shared)
381 .collect();
382 scored.sort_by(|a, b| b.1.cmp(&a.1));
383
384 for (id, _score) in scored {
385 if candidates.len() >= max_candidates {
386 break;
387 }
388 if let Some(entry) = self.entries.get(id.as_ref()) {
389 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
390 .unsigned_abs() as usize;
391 if len_diff <= max_length_diff {
392 candidates.push(Arc::clone(&id));
393 seen.insert(id);
394 }
395 }
396 }
397 }
398
399 candidates.truncate(max_candidates);
401 candidates.into_iter().map(|arc| (*arc).clone()).collect()
402 }
403
404 #[must_use]
408 pub fn all_ids(&self) -> Vec<CanonicalId> {
409 self.all_ids.iter().map(|arc| (**arc).clone()).collect()
410 }
411
412 #[must_use]
414 pub fn len(&self) -> usize {
415 self.entries.len()
416 }
417
418 #[must_use]
420 pub fn is_empty(&self) -> bool {
421 self.entries.is_empty()
422 }
423
424 #[must_use]
431 pub fn find_candidates_parallel<'a>(
432 &self,
433 sources: &[(&'a CanonicalId, &NormalizedEntry)],
434 max_candidates: usize,
435 max_length_diff: usize,
436 ) -> Vec<(&'a CanonicalId, Vec<CanonicalId>)> {
437 sources
438 .par_iter()
439 .map(|(source_id, source_entry)| {
440 let candidates =
441 self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
442 (*source_id, candidates)
443 })
444 .collect()
445 }
446
447 #[must_use]
452 pub fn find_all_candidates_from(
453 &self,
454 other: &Self,
455 max_candidates: usize,
456 max_length_diff: usize,
457 ) -> Vec<(CanonicalId, Vec<CanonicalId>)> {
458 let sources: Vec<_> = other.entries.iter().collect();
459
460 sources
461 .par_iter()
462 .map(|(source_id, source_entry)| {
463 let candidates =
464 self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
465 ((*source_id).as_ref().clone(), candidates)
467 })
468 .collect::<Vec<_>>()
469 }
470
471 pub fn stats(&self) -> IndexStats {
473 let ecosystems = self.by_ecosystem.len();
474 let prefixes = self.by_prefix.len();
475 let trigrams = self.by_trigram.len();
476 let avg_per_ecosystem = if ecosystems > 0 {
477 self.by_ecosystem
478 .values()
479 .map(std::vec::Vec::len)
480 .sum::<usize>()
481 / ecosystems
482 } else {
483 0
484 };
485 let avg_per_prefix = if prefixes > 0 {
486 self.by_prefix
487 .values()
488 .map(std::vec::Vec::len)
489 .sum::<usize>()
490 / prefixes
491 } else {
492 0
493 };
494 let avg_per_trigram = if trigrams > 0 {
495 self.by_trigram
496 .values()
497 .map(std::vec::Vec::len)
498 .sum::<usize>()
499 / trigrams
500 } else {
501 0
502 };
503
504 IndexStats {
505 total_components: self.entries.len(),
506 ecosystems,
507 prefixes,
508 trigrams,
509 avg_per_ecosystem,
510 avg_per_prefix,
511 avg_per_trigram,
512 }
513 }
514
515 #[must_use]
519 pub fn trigram_similarity(entry_a: &NormalizedEntry, entry_b: &NormalizedEntry) -> f64 {
520 if entry_a.trigrams.is_empty() || entry_b.trigrams.is_empty() {
521 return 0.0;
522 }
523
524 let set_a: HashSet<_> = entry_a.trigrams.iter().collect();
525 let set_b: HashSet<_> = entry_b.trigrams.iter().collect();
526
527 let intersection = set_a.intersection(&set_b).count();
528 let union = set_a.union(&set_b).count();
529
530 if union == 0 {
531 0.0
532 } else {
533 intersection as f64 / union as f64
534 }
535 }
536}
537
538#[derive(Debug, Clone)]
540pub struct IndexStats {
541 pub total_components: usize,
543 pub ecosystems: usize,
545 pub prefixes: usize,
547 pub trigrams: usize,
549 pub avg_per_ecosystem: usize,
551 pub avg_per_prefix: usize,
553 pub avg_per_trigram: usize,
555}
556
557pub struct BatchCandidateGenerator {
566 component_index: ComponentIndex,
568 lsh_index: Option<super::lsh::LshIndex>,
570 cross_ecosystem_db: Option<super::cross_ecosystem::CrossEcosystemDb>,
572 config: BatchCandidateConfig,
574}
575
576#[derive(Debug, Clone)]
578pub struct BatchCandidateConfig {
579 pub max_candidates: usize,
581 pub max_length_diff: usize,
583 pub lsh_threshold: usize,
585 pub enable_cross_ecosystem: bool,
587}
588
589impl Default for BatchCandidateConfig {
590 fn default() -> Self {
591 Self {
592 max_candidates: 100,
593 max_length_diff: 5,
594 lsh_threshold: 500, enable_cross_ecosystem: true,
596 }
597 }
598}
599
600#[derive(Debug)]
602pub struct BatchCandidateResult {
603 pub source_id: CanonicalId,
605 pub index_candidates: Vec<CanonicalId>,
607 pub lsh_candidates: Vec<CanonicalId>,
609 pub cross_ecosystem_candidates: Vec<CanonicalId>,
611 pub total_unique: usize,
613}
614
615impl BatchCandidateGenerator {
616 #[must_use]
618 pub fn build(sbom: &NormalizedSbom, config: BatchCandidateConfig) -> Self {
619 let component_index = ComponentIndex::build(sbom);
620
621 let lsh_index = if sbom.component_count() >= config.lsh_threshold {
623 Some(super::lsh::LshIndex::build(
624 sbom,
625 super::lsh::LshConfig::default(),
626 ))
627 } else {
628 None
629 };
630
631 let cross_ecosystem_db = if config.enable_cross_ecosystem {
633 Some(super::cross_ecosystem::CrossEcosystemDb::with_builtin_mappings())
634 } else {
635 None
636 };
637
638 Self {
639 component_index,
640 lsh_index,
641 cross_ecosystem_db,
642 config,
643 }
644 }
645
646 pub fn find_candidates(
648 &self,
649 source_id: &CanonicalId,
650 source_component: &Component,
651 ) -> BatchCandidateResult {
652 let mut seen: HashSet<CanonicalId> = HashSet::new();
653
654 let source_entry = self.component_index.get_entry(source_id).map_or_else(
656 || {
657 ComponentIndex::normalize_component(source_component)
659 },
660 NormalizedEntry::clone,
661 );
662
663 let index_candidates = self.component_index.find_candidates(
665 source_id,
666 &source_entry,
667 self.config.max_candidates,
668 self.config.max_length_diff,
669 );
670 for id in &index_candidates {
671 seen.insert(id.clone());
672 }
673
674 let lsh_candidates: Vec<CanonicalId> =
676 self.lsh_index.as_ref().map_or_else(Vec::new, |lsh| {
677 let candidates: Vec<_> = lsh
678 .find_candidates(source_component)
679 .into_iter()
680 .filter(|id| id != source_id && !seen.contains(id))
681 .take(self.config.max_candidates / 2) .collect();
683 for id in &candidates {
684 seen.insert(id.clone());
685 }
686 candidates
687 });
688
689 let cross_ecosystem_candidates: Vec<CanonicalId> = if let (Some(db), Some(eco)) =
691 (&self.cross_ecosystem_db, &source_component.ecosystem)
692 {
693 let candidates: Vec<_> = db
694 .find_equivalents(eco, &source_component.name)
695 .into_iter()
696 .flat_map(|m| {
697 let target_eco_str = m.target_ecosystem.to_string().to_lowercase();
699 self.component_index
700 .get_by_ecosystem(&target_eco_str)
701 .unwrap_or_default()
702 })
703 .filter(|id| id != source_id && !seen.contains(id))
704 .take(self.config.max_candidates / 4) .collect();
706 for id in &candidates {
707 seen.insert(id.clone());
708 }
709 candidates
710 } else {
711 Vec::new()
712 };
713
714 let total_unique = seen.len();
715
716 BatchCandidateResult {
717 source_id: source_id.clone(),
718 index_candidates,
719 lsh_candidates,
720 cross_ecosystem_candidates,
721 total_unique,
722 }
723 }
724
725 #[must_use]
727 pub fn find_candidates_batch(
728 &self,
729 sources: &[(&CanonicalId, &Component)],
730 ) -> Vec<BatchCandidateResult> {
731 sources
732 .par_iter()
733 .map(|(id, comp)| self.find_candidates(id, comp))
734 .collect()
735 }
736
737 #[must_use]
739 pub fn all_candidates(
740 &self,
741 source_id: &CanonicalId,
742 source_component: &Component,
743 ) -> Vec<CanonicalId> {
744 let result = self.find_candidates(source_id, source_component);
745 let mut all: Vec<_> = result.index_candidates;
746 all.extend(result.lsh_candidates);
747 all.extend(result.cross_ecosystem_candidates);
748 all
749 }
750
751 #[must_use]
753 pub const fn component_index(&self) -> &ComponentIndex {
754 &self.component_index
755 }
756
757 #[must_use]
759 pub const fn has_lsh(&self) -> bool {
760 self.lsh_index.is_some()
761 }
762
763 #[must_use]
765 pub const fn has_cross_ecosystem(&self) -> bool {
766 self.cross_ecosystem_db.is_some()
767 }
768
769 pub fn stats(&self) -> BatchCandidateStats {
771 BatchCandidateStats {
772 index_stats: self.component_index.stats(),
773 lsh_enabled: self.lsh_index.is_some(),
774 lsh_stats: self.lsh_index.as_ref().map(super::lsh::LshIndex::stats),
775 cross_ecosystem_enabled: self.cross_ecosystem_db.is_some(),
776 }
777 }
778}
779
780#[derive(Debug)]
782pub struct BatchCandidateStats {
783 pub index_stats: IndexStats,
785 pub lsh_enabled: bool,
787 pub lsh_stats: Option<super::lsh::LshIndexStats>,
789 pub cross_ecosystem_enabled: bool,
791}
792
793pub struct LazyComponentIndex {
798 sbom: Option<std::sync::Arc<NormalizedSbom>>,
800 index: std::sync::OnceLock<ComponentIndex>,
802}
803
804impl LazyComponentIndex {
805 #[must_use]
807 pub const fn new(sbom: std::sync::Arc<NormalizedSbom>) -> Self {
808 Self {
809 sbom: Some(sbom),
810 index: std::sync::OnceLock::new(),
811 }
812 }
813
814 #[must_use]
816 pub fn from_index(index: ComponentIndex) -> Self {
817 let lazy = Self {
818 sbom: None,
819 index: std::sync::OnceLock::new(),
820 };
821 let _ = lazy.index.set(index);
822 lazy
823 }
824
825 pub fn get(&self) -> &ComponentIndex {
830 self.index.get_or_init(|| {
831 self.sbom.as_ref().map_or_else(
832 || {
833 ComponentIndex::build(&NormalizedSbom::default())
835 },
836 |sbom| ComponentIndex::build(sbom),
837 )
838 })
839 }
840
841 pub fn is_built(&self) -> bool {
843 self.index.get().is_some()
844 }
845
846 pub fn try_get(&self) -> Option<&ComponentIndex> {
848 self.index.get()
849 }
850}
851
852impl std::ops::Deref for LazyComponentIndex {
853 type Target = ComponentIndex;
854
855 fn deref(&self) -> &Self::Target {
856 self.get()
857 }
858}
859
860#[cfg(test)]
861mod tests {
862 use super::*;
863 use crate::model::{DocumentMetadata, Ecosystem};
864
865 fn make_component(name: &str, purl: Option<&str>) -> Component {
866 let mut comp = Component::new(name.to_string(), format!("test-{}", name));
867 comp.version = Some("1.0.0".to_string());
868 comp.identifiers.purl = purl.map(|s| s.to_string());
869 comp.ecosystem = purl
871 .and_then(ComponentIndex::extract_ecosystem)
872 .map(|eco_str| Ecosystem::from_purl_type(&eco_str));
873 comp
874 }
875
876 #[test]
877 fn test_extract_ecosystem() {
878 assert_eq!(
879 ComponentIndex::extract_ecosystem("pkg:pypi/requests@2.28.0"),
880 Some("pypi".to_string())
881 );
882 assert_eq!(
883 ComponentIndex::extract_ecosystem("pkg:npm/@angular/core@14.0.0"),
884 Some("npm".to_string())
885 );
886 assert_eq!(
887 ComponentIndex::extract_ecosystem("pkg:cargo/serde@1.0.0"),
888 Some("cargo".to_string())
889 );
890 }
891
892 #[test]
893 fn test_normalize_name_pypi() {
894 assert_eq!(
895 ComponentIndex::normalize_name("Python_Dateutil", Some("pypi")),
896 "python-dateutil"
897 );
898 assert_eq!(
899 ComponentIndex::normalize_name("Some.Package", Some("pypi")),
900 "some-package"
901 );
902 }
903
904 #[test]
905 fn test_normalize_name_cargo() {
906 assert_eq!(
907 ComponentIndex::normalize_name("serde-json", Some("cargo")),
908 "serde_json"
909 );
910 }
911
912 #[test]
913 fn test_build_index() {
914 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
915
916 let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
917 let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
918 let comp3 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
919
920 sbom.add_component(comp1);
921 sbom.add_component(comp2);
922 sbom.add_component(comp3);
923
924 let index = ComponentIndex::build(&sbom);
925
926 assert_eq!(index.len(), 3);
927 assert_eq!(index.by_ecosystem.get("pypi").map(|v| v.len()), Some(2));
928 assert_eq!(index.by_ecosystem.get("cargo").map(|v| v.len()), Some(1));
929 }
930
931 #[test]
932 fn test_find_candidates_same_ecosystem() {
933 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
934
935 let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
936 let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
937 let comp3 = make_component("flask", Some("pkg:pypi/flask@2.0.0"));
938 let comp4 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
939
940 sbom.add_component(comp1.clone());
941 sbom.add_component(comp2);
942 sbom.add_component(comp3);
943 sbom.add_component(comp4);
944
945 let index = ComponentIndex::build(&sbom);
946
947 let requests_id = sbom
949 .components
950 .keys()
951 .find(|id| {
952 sbom.components
953 .get(*id)
954 .map(|c| c.name == "requests")
955 .unwrap_or(false)
956 })
957 .unwrap();
958
959 let entry = index.get_entry(requests_id).unwrap();
960 let candidates = index.find_candidates(requests_id, entry, 10, 5);
961
962 assert!(candidates.len() >= 2);
964 for cand_id in &candidates {
965 let cand_entry = index.get_entry(cand_id).unwrap();
966 assert_eq!(cand_entry.ecosystem, Some("pypi".to_string()));
967 }
968 }
969
970 #[test]
971 fn test_compute_trigrams() {
972 let trigrams = ComponentIndex::compute_trigrams("lodash");
974 assert_eq!(trigrams, vec!["lod", "oda", "das", "ash"]);
975
976 let trigrams = ComponentIndex::compute_trigrams("ab");
978 assert_eq!(trigrams, vec!["ab"]);
979
980 let trigrams = ComponentIndex::compute_trigrams("");
982 assert!(trigrams.is_empty());
983
984 let trigrams = ComponentIndex::compute_trigrams("abc");
986 assert_eq!(trigrams, vec!["abc"]);
987 }
988
989 #[test]
990 fn test_trigram_similarity() {
991 let entry_a = NormalizedEntry {
992 normalized_purl: None,
993 normalized_name: "lodash".to_string(),
994 name_length: 6,
995 ecosystem: None,
996 prefix: "lod".to_string(),
997 trigrams: vec![
998 "lod".to_string(),
999 "oda".to_string(),
1000 "das".to_string(),
1001 "ash".to_string(),
1002 ],
1003 };
1004
1005 let entry_b = NormalizedEntry {
1006 normalized_purl: None,
1007 normalized_name: "lodash-es".to_string(),
1008 name_length: 9,
1009 ecosystem: None,
1010 prefix: "lod".to_string(),
1011 trigrams: vec![
1012 "lod".to_string(),
1013 "oda".to_string(),
1014 "das".to_string(),
1015 "ash".to_string(),
1016 "sh-".to_string(),
1017 "h-e".to_string(),
1018 "-es".to_string(),
1019 ],
1020 };
1021
1022 let similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_b);
1023 assert!(
1026 similarity > 0.5 && similarity < 0.6,
1027 "Expected ~0.57, got {}",
1028 similarity
1029 );
1030
1031 let same_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_a);
1033 assert!((same_similarity - 1.0).abs() < f64::EPSILON);
1034
1035 let entry_c = NormalizedEntry {
1037 normalized_purl: None,
1038 normalized_name: "react".to_string(),
1039 name_length: 5,
1040 ecosystem: None,
1041 prefix: "rea".to_string(),
1042 trigrams: vec!["rea".to_string(), "eac".to_string(), "act".to_string()],
1043 };
1044
1045 let diff_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_c);
1046 assert!(
1047 diff_similarity < 0.1,
1048 "Expected low similarity, got {}",
1049 diff_similarity
1050 );
1051 }
1052
1053 #[test]
1054 fn test_trigram_index_find_similar_suffix() {
1055 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
1057
1058 let comp1 = make_component("react-dom", Some("pkg:npm/react-dom@18.0.0"));
1060 let comp2 = make_component("preact-dom", Some("pkg:npm/preact-dom@10.0.0")); let comp3 = make_component("angular", Some("pkg:npm/angular@15.0.0")); sbom.add_component(comp1.clone());
1064 sbom.add_component(comp2);
1065 sbom.add_component(comp3);
1066
1067 let index = ComponentIndex::build(&sbom);
1068
1069 let react_id = sbom
1071 .components
1072 .keys()
1073 .find(|id| {
1074 sbom.components
1075 .get(*id)
1076 .map(|c| c.name == "react-dom")
1077 .unwrap_or(false)
1078 })
1079 .unwrap();
1080
1081 let entry = index.get_entry(react_id).unwrap();
1082
1083 let candidates = index.find_candidates(react_id, entry, 10, 5);
1085
1086 let preact_found = candidates.iter().any(|id| {
1087 index
1088 .get_entry(id)
1089 .map(|e| e.normalized_name.contains("preact"))
1090 .unwrap_or(false)
1091 });
1092
1093 assert!(preact_found, "Should find preact-dom via trigram matching");
1094 }
1095}