1use crate::model::{CanonicalId, Component, NormalizedSbom};
7use rayon::prelude::*;
8use std::collections::{HashMap, HashSet};
9use std::sync::Arc;
10
11#[derive(Debug, Clone)]
13pub struct NormalizedEntry {
14 pub normalized_purl: Option<String>,
16 pub normalized_name: String,
18 pub name_length: usize,
20 pub ecosystem: Option<String>,
22 pub prefix: String,
24 pub trigrams: Vec<String>,
26}
27
28pub struct ComponentIndex {
38 by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>>,
40 by_prefix: HashMap<String, Vec<Arc<CanonicalId>>>,
42 by_trigram: HashMap<String, Vec<Arc<CanonicalId>>>,
44 entries: HashMap<Arc<CanonicalId>, NormalizedEntry>,
46 all_ids: Vec<Arc<CanonicalId>>,
48}
49
50impl ComponentIndex {
51 pub fn build(sbom: &NormalizedSbom) -> Self {
56 let mut by_ecosystem: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
57 let mut by_prefix: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
58 let mut by_trigram: HashMap<String, Vec<Arc<CanonicalId>>> = HashMap::new();
59 let mut entries: HashMap<Arc<CanonicalId>, NormalizedEntry> = HashMap::new();
60 let mut all_ids: Vec<Arc<CanonicalId>> = Vec::new();
61
62 for (id, comp) in &sbom.components {
63 let entry = Self::normalize_component(comp);
64 let arc_id = Arc::new(id.clone());
66
67 if let Some(ref eco) = entry.ecosystem {
69 by_ecosystem
70 .entry(eco.clone())
71 .or_default()
72 .push(Arc::clone(&arc_id));
73 }
74
75 if !entry.prefix.is_empty() {
77 by_prefix
78 .entry(entry.prefix.clone())
79 .or_default()
80 .push(Arc::clone(&arc_id));
81 }
82
83 for trigram in &entry.trigrams {
85 by_trigram
86 .entry(trigram.clone())
87 .or_default()
88 .push(Arc::clone(&arc_id));
89 }
90
91 entries.insert(Arc::clone(&arc_id), entry);
92 all_ids.push(arc_id);
93 }
94
95 Self {
96 by_ecosystem,
97 by_prefix,
98 by_trigram,
99 entries,
100 all_ids,
101 }
102 }
103
104 pub fn normalize_component(comp: &Component) -> NormalizedEntry {
106 let (ecosystem, normalized_purl) = if let Some(ref purl) = comp.identifiers.purl {
108 let eco = Self::extract_ecosystem(purl);
109 let normalized = Self::normalize_purl(purl);
110 (eco, Some(normalized))
111 } else {
112 (comp.ecosystem.as_ref().map(|e| e.to_string()), None)
115 };
116
117 let normalized_name = Self::normalize_name(&comp.name, ecosystem.as_deref());
119 let name_length = normalized_name.len();
120 let prefix = normalized_name.chars().take(3).collect::<String>();
121 let trigrams = Self::compute_trigrams(&normalized_name);
122
123 NormalizedEntry {
124 normalized_purl,
125 normalized_name,
126 name_length,
127 ecosystem,
128 prefix,
129 trigrams,
130 }
131 }
132
133 fn compute_trigrams(name: &str) -> Vec<String> {
138 if name.len() < 3 {
139 return if name.is_empty() {
141 vec![]
142 } else {
143 vec![name.to_string()]
144 };
145 }
146
147 if name.is_ascii() {
150 return name
151 .as_bytes()
152 .windows(3)
153 .map(|w| {
154 unsafe { std::str::from_utf8_unchecked(w) }.to_string()
157 })
158 .collect();
159 }
160
161 let chars: Vec<char> = name.chars().collect();
163 if chars.len() < 3 {
164 return vec![name.to_string()];
165 }
166
167 chars
168 .windows(3)
169 .map(|w| w.iter().collect::<String>())
170 .collect()
171 }
172
173 fn extract_ecosystem(purl: &str) -> Option<String> {
175 if let Some(rest) = purl.strip_prefix("pkg:") {
177 if let Some(slash_pos) = rest.find('/') {
178 return Some(rest[..slash_pos].to_lowercase());
179 }
180 }
181 None
182 }
183
184 fn normalize_purl(purl: &str) -> String {
186 let purl_lower = purl.to_lowercase();
188 if let Some(at_pos) = purl_lower.rfind('@') {
190 purl_lower[..at_pos].to_string()
191 } else {
192 purl_lower
193 }
194 }
195
196 pub fn normalize_name(name: &str, ecosystem: Option<&str>) -> String {
206 let mut normalized = name.to_lowercase();
207
208 match ecosystem {
210 Some("pypi") => {
211 normalized = normalized.replace(['_', '.'], "-");
213 }
214 Some("cargo") => {
215 normalized = normalized.replace('-', "_");
217 }
218 Some("npm") => {
219 }
222 _ => {
223 normalized = normalized.replace('_', "-");
225 }
226 }
227
228 while normalized.contains("--") {
230 normalized = normalized.replace("--", "-");
231 }
232
233 normalized
234 }
235
236 pub fn get_entry(&self, id: &CanonicalId) -> Option<&NormalizedEntry> {
238 self.entries.get(id)
240 }
241
242 pub fn get_by_ecosystem(&self, ecosystem: &str) -> Option<Vec<CanonicalId>> {
247 self.by_ecosystem
248 .get(ecosystem)
249 .map(|v| v.iter().map(|arc| (**arc).clone()).collect())
250 }
251
252 pub fn find_candidates(
260 &self,
261 source_id: &CanonicalId,
262 source_entry: &NormalizedEntry,
263 max_candidates: usize,
264 max_length_diff: usize,
265 ) -> Vec<CanonicalId> {
266 let mut candidates: Vec<Arc<CanonicalId>> = Vec::new();
267 let mut seen: HashSet<Arc<CanonicalId>> = HashSet::new();
268
269 if let Some(ref eco) = source_entry.ecosystem {
271 if let Some(ids) = self.by_ecosystem.get(eco) {
272 for id in ids {
273 if id.as_ref() != source_id && !seen.contains(id) {
274 if let Some(entry) = self.entries.get(id.as_ref()) {
276 let len_diff = (source_entry.name_length as i32
277 - entry.name_length as i32)
278 .unsigned_abs() as usize;
279 if len_diff <= max_length_diff {
280 candidates.push(Arc::clone(id));
281 seen.insert(Arc::clone(id));
282 }
283 }
284 }
285 }
286 }
287 }
288
289 if candidates.len() < max_candidates && !source_entry.prefix.is_empty() {
291 if let Some(ids) = self.by_prefix.get(&source_entry.prefix) {
292 for id in ids {
293 if id.as_ref() != source_id && !seen.contains(id) {
294 if let Some(entry) = self.entries.get(id.as_ref()) {
295 let len_diff = (source_entry.name_length as i32
296 - entry.name_length as i32)
297 .unsigned_abs() as usize;
298 if len_diff <= max_length_diff {
299 candidates.push(Arc::clone(id));
300 seen.insert(Arc::clone(id));
301 }
302 }
303 }
304 if candidates.len() >= max_candidates {
305 break;
306 }
307 }
308 }
309 }
310
311 if candidates.len() < max_candidates && source_entry.prefix.len() >= 2 {
313 let prefix_2 = &source_entry.prefix[..2.min(source_entry.prefix.len())];
314 for (prefix, ids) in &self.by_prefix {
315 if prefix.starts_with(prefix_2) && prefix != &source_entry.prefix {
316 for id in ids {
317 if id.as_ref() != source_id && !seen.contains(id) {
318 if let Some(entry) = self.entries.get(id.as_ref()) {
319 let len_diff = (source_entry.name_length as i32
320 - entry.name_length as i32)
321 .unsigned_abs()
322 as usize;
323 if len_diff <= max_length_diff {
324 candidates.push(Arc::clone(id));
325 seen.insert(Arc::clone(id));
326 }
327 }
328 }
329 if candidates.len() >= max_candidates {
330 break;
331 }
332 }
333 }
334 if candidates.len() >= max_candidates {
335 break;
336 }
337 }
338 }
339
340 if candidates.len() < max_candidates && !source_entry.trigrams.is_empty() {
343 let mut trigram_scores: HashMap<Arc<CanonicalId>, usize> = HashMap::new();
345
346 for trigram in &source_entry.trigrams {
347 if let Some(ids) = self.by_trigram.get(trigram) {
348 for id in ids {
349 if id.as_ref() != source_id && !seen.contains(id) {
350 *trigram_scores.entry(Arc::clone(id)).or_default() += 1;
351 }
352 }
353 }
354 }
355
356 let min_shared = if source_entry.trigrams.len() <= 2 {
358 1
359 } else {
360 2
361 };
362
363 let mut scored: Vec<_> = trigram_scores
365 .into_iter()
366 .filter(|(_, count)| *count >= min_shared)
367 .collect();
368 scored.sort_by(|a, b| b.1.cmp(&a.1));
369
370 for (id, _score) in scored {
371 if candidates.len() >= max_candidates {
372 break;
373 }
374 if let Some(entry) = self.entries.get(id.as_ref()) {
375 let len_diff = (source_entry.name_length as i32 - entry.name_length as i32)
376 .unsigned_abs() as usize;
377 if len_diff <= max_length_diff {
378 candidates.push(Arc::clone(&id));
379 seen.insert(id);
380 }
381 }
382 }
383 }
384
385 candidates.truncate(max_candidates);
387 candidates.into_iter().map(|arc| (*arc).clone()).collect()
388 }
389
390 pub fn all_ids(&self) -> Vec<CanonicalId> {
394 self.all_ids.iter().map(|arc| (**arc).clone()).collect()
395 }
396
397 pub fn len(&self) -> usize {
399 self.entries.len()
400 }
401
402 pub fn is_empty(&self) -> bool {
404 self.entries.is_empty()
405 }
406
407 pub fn find_candidates_parallel<'a>(
414 &self,
415 sources: &[(&'a CanonicalId, &NormalizedEntry)],
416 max_candidates: usize,
417 max_length_diff: usize,
418 ) -> Vec<(&'a CanonicalId, Vec<CanonicalId>)> {
419 sources
420 .par_iter()
421 .map(|(source_id, source_entry)| {
422 let candidates =
423 self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
424 (*source_id, candidates)
425 })
426 .collect()
427 }
428
429 pub fn find_all_candidates_from(
434 &self,
435 other: &ComponentIndex,
436 max_candidates: usize,
437 max_length_diff: usize,
438 ) -> Vec<(CanonicalId, Vec<CanonicalId>)> {
439 let sources: Vec<_> = other
440 .entries
441 .iter()
442 .collect();
443
444 sources
445 .par_iter()
446 .map(|(source_id, source_entry)| {
447 let candidates =
448 self.find_candidates(source_id, source_entry, max_candidates, max_length_diff);
449 ((*source_id).as_ref().clone(), candidates)
451 })
452 .collect::<Vec<_>>()
453 }
454
455 pub fn stats(&self) -> IndexStats {
457 let ecosystems = self.by_ecosystem.len();
458 let prefixes = self.by_prefix.len();
459 let trigrams = self.by_trigram.len();
460 let avg_per_ecosystem = if ecosystems > 0 {
461 self.by_ecosystem.values().map(|v| v.len()).sum::<usize>() / ecosystems
462 } else {
463 0
464 };
465 let avg_per_prefix = if prefixes > 0 {
466 self.by_prefix.values().map(|v| v.len()).sum::<usize>() / prefixes
467 } else {
468 0
469 };
470 let avg_per_trigram = if trigrams > 0 {
471 self.by_trigram.values().map(|v| v.len()).sum::<usize>() / trigrams
472 } else {
473 0
474 };
475
476 IndexStats {
477 total_components: self.entries.len(),
478 ecosystems,
479 prefixes,
480 trigrams,
481 avg_per_ecosystem,
482 avg_per_prefix,
483 avg_per_trigram,
484 }
485 }
486
487 pub fn trigram_similarity(entry_a: &NormalizedEntry, entry_b: &NormalizedEntry) -> f64 {
491 if entry_a.trigrams.is_empty() || entry_b.trigrams.is_empty() {
492 return 0.0;
493 }
494
495 let set_a: HashSet<_> = entry_a.trigrams.iter().collect();
496 let set_b: HashSet<_> = entry_b.trigrams.iter().collect();
497
498 let intersection = set_a.intersection(&set_b).count();
499 let union = set_a.union(&set_b).count();
500
501 if union == 0 {
502 0.0
503 } else {
504 intersection as f64 / union as f64
505 }
506 }
507}
508
509#[derive(Debug, Clone)]
511pub struct IndexStats {
512 pub total_components: usize,
514 pub ecosystems: usize,
516 pub prefixes: usize,
518 pub trigrams: usize,
520 pub avg_per_ecosystem: usize,
522 pub avg_per_prefix: usize,
524 pub avg_per_trigram: usize,
526}
527
528pub struct BatchCandidateGenerator {
537 component_index: ComponentIndex,
539 lsh_index: Option<super::lsh::LshIndex>,
541 cross_ecosystem_db: Option<super::cross_ecosystem::CrossEcosystemDb>,
543 config: BatchCandidateConfig,
545}
546
547#[derive(Debug, Clone)]
549pub struct BatchCandidateConfig {
550 pub max_candidates: usize,
552 pub max_length_diff: usize,
554 pub lsh_threshold: usize,
556 pub enable_cross_ecosystem: bool,
558}
559
560impl Default for BatchCandidateConfig {
561 fn default() -> Self {
562 Self {
563 max_candidates: 100,
564 max_length_diff: 5,
565 lsh_threshold: 500, enable_cross_ecosystem: true,
567 }
568 }
569}
570
571#[derive(Debug)]
573pub struct BatchCandidateResult {
574 pub source_id: CanonicalId,
576 pub index_candidates: Vec<CanonicalId>,
578 pub lsh_candidates: Vec<CanonicalId>,
580 pub cross_ecosystem_candidates: Vec<CanonicalId>,
582 pub total_unique: usize,
584}
585
586impl BatchCandidateGenerator {
587 pub fn build(sbom: &NormalizedSbom, config: BatchCandidateConfig) -> Self {
589 let component_index = ComponentIndex::build(sbom);
590
591 let lsh_index = if sbom.component_count() >= config.lsh_threshold {
593 Some(super::lsh::LshIndex::build(
594 sbom,
595 super::lsh::LshConfig::default(),
596 ))
597 } else {
598 None
599 };
600
601 let cross_ecosystem_db = if config.enable_cross_ecosystem {
603 Some(super::cross_ecosystem::CrossEcosystemDb::with_builtin_mappings())
604 } else {
605 None
606 };
607
608 Self {
609 component_index,
610 lsh_index,
611 cross_ecosystem_db,
612 config,
613 }
614 }
615
616 pub fn find_candidates(
618 &self,
619 source_id: &CanonicalId,
620 source_component: &Component,
621 ) -> BatchCandidateResult {
622 let mut seen: HashSet<CanonicalId> = HashSet::new();
623
624 let source_entry = if let Some(entry) = self.component_index.get_entry(source_id) {
626 entry.clone()
627 } else {
628 ComponentIndex::normalize_component(source_component)
630 };
631
632 let index_candidates = self.component_index.find_candidates(
634 source_id,
635 &source_entry,
636 self.config.max_candidates,
637 self.config.max_length_diff,
638 );
639 for id in &index_candidates {
640 seen.insert(id.clone());
641 }
642
643 let lsh_candidates: Vec<CanonicalId> = if let Some(ref lsh) = self.lsh_index {
645 let candidates: Vec<_> = lsh
646 .find_candidates(source_component)
647 .into_iter()
648 .filter(|id| id != source_id && !seen.contains(id))
649 .take(self.config.max_candidates / 2) .collect();
651 for id in &candidates {
652 seen.insert(id.clone());
653 }
654 candidates
655 } else {
656 Vec::new()
657 };
658
659 let cross_ecosystem_candidates: Vec<CanonicalId> =
661 if let (Some(ref db), Some(ref eco)) =
662 (&self.cross_ecosystem_db, &source_component.ecosystem)
663 {
664 let candidates: Vec<_> = db
665 .find_equivalents(eco, &source_component.name)
666 .into_iter()
667 .flat_map(|m| {
668 let target_eco_str = m.target_ecosystem.to_string().to_lowercase();
670 self.component_index
671 .get_by_ecosystem(&target_eco_str)
672 .unwrap_or_default()
673 })
674 .filter(|id| id != source_id && !seen.contains(id))
675 .take(self.config.max_candidates / 4) .collect();
677 for id in &candidates {
678 seen.insert(id.clone());
679 }
680 candidates
681 } else {
682 Vec::new()
683 };
684
685 let total_unique = seen.len();
686
687 BatchCandidateResult {
688 source_id: source_id.clone(),
689 index_candidates,
690 lsh_candidates,
691 cross_ecosystem_candidates,
692 total_unique,
693 }
694 }
695
696 pub fn find_candidates_batch(
698 &self,
699 sources: &[(&CanonicalId, &Component)],
700 ) -> Vec<BatchCandidateResult> {
701 sources
702 .par_iter()
703 .map(|(id, comp)| self.find_candidates(id, comp))
704 .collect()
705 }
706
707 pub fn all_candidates(
709 &self,
710 source_id: &CanonicalId,
711 source_component: &Component,
712 ) -> Vec<CanonicalId> {
713 let result = self.find_candidates(source_id, source_component);
714 let mut all: Vec<_> = result.index_candidates;
715 all.extend(result.lsh_candidates);
716 all.extend(result.cross_ecosystem_candidates);
717 all
718 }
719
720 pub fn component_index(&self) -> &ComponentIndex {
722 &self.component_index
723 }
724
725 pub fn has_lsh(&self) -> bool {
727 self.lsh_index.is_some()
728 }
729
730 pub fn has_cross_ecosystem(&self) -> bool {
732 self.cross_ecosystem_db.is_some()
733 }
734
735 pub fn stats(&self) -> BatchCandidateStats {
737 BatchCandidateStats {
738 index_stats: self.component_index.stats(),
739 lsh_enabled: self.lsh_index.is_some(),
740 lsh_stats: self.lsh_index.as_ref().map(|l| l.stats()),
741 cross_ecosystem_enabled: self.cross_ecosystem_db.is_some(),
742 }
743 }
744}
745
746#[derive(Debug)]
748pub struct BatchCandidateStats {
749 pub index_stats: IndexStats,
751 pub lsh_enabled: bool,
753 pub lsh_stats: Option<super::lsh::LshIndexStats>,
755 pub cross_ecosystem_enabled: bool,
757}
758
759pub struct LazyComponentIndex {
764 sbom: Option<std::sync::Arc<NormalizedSbom>>,
766 index: std::sync::OnceLock<ComponentIndex>,
768}
769
770impl LazyComponentIndex {
771 pub fn new(sbom: std::sync::Arc<NormalizedSbom>) -> Self {
773 Self {
774 sbom: Some(sbom),
775 index: std::sync::OnceLock::new(),
776 }
777 }
778
779 pub fn from_index(index: ComponentIndex) -> Self {
781 let lazy = Self {
782 sbom: None,
783 index: std::sync::OnceLock::new(),
784 };
785 let _ = lazy.index.set(index);
786 lazy
787 }
788
789 pub fn get(&self) -> &ComponentIndex {
794 self.index.get_or_init(|| {
795 if let Some(ref sbom) = self.sbom {
796 ComponentIndex::build(sbom)
797 } else {
798 ComponentIndex::build(&NormalizedSbom::default())
800 }
801 })
802 }
803
804 pub fn is_built(&self) -> bool {
806 self.index.get().is_some()
807 }
808
809 pub fn try_get(&self) -> Option<&ComponentIndex> {
811 self.index.get()
812 }
813}
814
815impl std::ops::Deref for LazyComponentIndex {
816 type Target = ComponentIndex;
817
818 fn deref(&self) -> &Self::Target {
819 self.get()
820 }
821}
822
823#[cfg(test)]
824mod tests {
825 use super::*;
826 use crate::model::{DocumentMetadata, Ecosystem};
827
828 fn make_component(name: &str, purl: Option<&str>) -> Component {
829 let mut comp = Component::new(name.to_string(), format!("test-{}", name));
830 comp.version = Some("1.0.0".to_string());
831 comp.identifiers.purl = purl.map(|s| s.to_string());
832 comp.ecosystem = purl
834 .and_then(|p| ComponentIndex::extract_ecosystem(p))
835 .map(|eco_str| Ecosystem::from_purl_type(&eco_str));
836 comp
837 }
838
839 #[test]
840 fn test_extract_ecosystem() {
841 assert_eq!(
842 ComponentIndex::extract_ecosystem("pkg:pypi/requests@2.28.0"),
843 Some("pypi".to_string())
844 );
845 assert_eq!(
846 ComponentIndex::extract_ecosystem("pkg:npm/@angular/core@14.0.0"),
847 Some("npm".to_string())
848 );
849 assert_eq!(
850 ComponentIndex::extract_ecosystem("pkg:cargo/serde@1.0.0"),
851 Some("cargo".to_string())
852 );
853 }
854
855 #[test]
856 fn test_normalize_name_pypi() {
857 assert_eq!(
858 ComponentIndex::normalize_name("Python_Dateutil", Some("pypi")),
859 "python-dateutil"
860 );
861 assert_eq!(
862 ComponentIndex::normalize_name("Some.Package", Some("pypi")),
863 "some-package"
864 );
865 }
866
867 #[test]
868 fn test_normalize_name_cargo() {
869 assert_eq!(
870 ComponentIndex::normalize_name("serde-json", Some("cargo")),
871 "serde_json"
872 );
873 }
874
875 #[test]
876 fn test_build_index() {
877 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
878
879 let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
880 let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
881 let comp3 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
882
883 sbom.add_component(comp1);
884 sbom.add_component(comp2);
885 sbom.add_component(comp3);
886
887 let index = ComponentIndex::build(&sbom);
888
889 assert_eq!(index.len(), 3);
890 assert_eq!(index.by_ecosystem.get("pypi").map(|v| v.len()), Some(2));
891 assert_eq!(index.by_ecosystem.get("cargo").map(|v| v.len()), Some(1));
892 }
893
894 #[test]
895 fn test_find_candidates_same_ecosystem() {
896 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
897
898 let comp1 = make_component("requests", Some("pkg:pypi/requests@2.28.0"));
899 let comp2 = make_component("urllib3", Some("pkg:pypi/urllib3@1.26.0"));
900 let comp3 = make_component("flask", Some("pkg:pypi/flask@2.0.0"));
901 let comp4 = make_component("serde", Some("pkg:cargo/serde@1.0.0"));
902
903 sbom.add_component(comp1.clone());
904 sbom.add_component(comp2);
905 sbom.add_component(comp3);
906 sbom.add_component(comp4);
907
908 let index = ComponentIndex::build(&sbom);
909
910 let requests_id = sbom
912 .components
913 .keys()
914 .find(|id| {
915 sbom.components
916 .get(*id)
917 .map(|c| c.name == "requests")
918 .unwrap_or(false)
919 })
920 .unwrap();
921
922 let entry = index.get_entry(requests_id).unwrap();
923 let candidates = index.find_candidates(requests_id, entry, 10, 5);
924
925 assert!(candidates.len() >= 2);
927 for cand_id in &candidates {
928 let cand_entry = index.get_entry(cand_id).unwrap();
929 assert_eq!(cand_entry.ecosystem, Some("pypi".to_string()));
930 }
931 }
932
933 #[test]
934 fn test_compute_trigrams() {
935 let trigrams = ComponentIndex::compute_trigrams("lodash");
937 assert_eq!(trigrams, vec!["lod", "oda", "das", "ash"]);
938
939 let trigrams = ComponentIndex::compute_trigrams("ab");
941 assert_eq!(trigrams, vec!["ab"]);
942
943 let trigrams = ComponentIndex::compute_trigrams("");
945 assert!(trigrams.is_empty());
946
947 let trigrams = ComponentIndex::compute_trigrams("abc");
949 assert_eq!(trigrams, vec!["abc"]);
950 }
951
952 #[test]
953 fn test_trigram_similarity() {
954 let entry_a = NormalizedEntry {
955 normalized_purl: None,
956 normalized_name: "lodash".to_string(),
957 name_length: 6,
958 ecosystem: None,
959 prefix: "lod".to_string(),
960 trigrams: vec![
961 "lod".to_string(),
962 "oda".to_string(),
963 "das".to_string(),
964 "ash".to_string(),
965 ],
966 };
967
968 let entry_b = NormalizedEntry {
969 normalized_purl: None,
970 normalized_name: "lodash-es".to_string(),
971 name_length: 9,
972 ecosystem: None,
973 prefix: "lod".to_string(),
974 trigrams: vec![
975 "lod".to_string(),
976 "oda".to_string(),
977 "das".to_string(),
978 "ash".to_string(),
979 "sh-".to_string(),
980 "h-e".to_string(),
981 "-es".to_string(),
982 ],
983 };
984
985 let similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_b);
986 assert!(
989 similarity > 0.5 && similarity < 0.6,
990 "Expected ~0.57, got {}",
991 similarity
992 );
993
994 let same_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_a);
996 assert!((same_similarity - 1.0).abs() < f64::EPSILON);
997
998 let entry_c = NormalizedEntry {
1000 normalized_purl: None,
1001 normalized_name: "react".to_string(),
1002 name_length: 5,
1003 ecosystem: None,
1004 prefix: "rea".to_string(),
1005 trigrams: vec!["rea".to_string(), "eac".to_string(), "act".to_string()],
1006 };
1007
1008 let diff_similarity = ComponentIndex::trigram_similarity(&entry_a, &entry_c);
1009 assert!(
1010 diff_similarity < 0.1,
1011 "Expected low similarity, got {}",
1012 diff_similarity
1013 );
1014 }
1015
1016 #[test]
1017 fn test_trigram_index_find_similar_suffix() {
1018 let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
1020
1021 let comp1 = make_component("react-dom", Some("pkg:npm/react-dom@18.0.0"));
1023 let comp2 = make_component("preact-dom", Some("pkg:npm/preact-dom@10.0.0")); let comp3 = make_component("angular", Some("pkg:npm/angular@15.0.0")); sbom.add_component(comp1.clone());
1027 sbom.add_component(comp2);
1028 sbom.add_component(comp3);
1029
1030 let index = ComponentIndex::build(&sbom);
1031
1032 let react_id = sbom
1034 .components
1035 .keys()
1036 .find(|id| {
1037 sbom.components
1038 .get(*id)
1039 .map(|c| c.name == "react-dom")
1040 .unwrap_or(false)
1041 })
1042 .unwrap();
1043
1044 let entry = index.get_entry(react_id).unwrap();
1045
1046 let candidates = index.find_candidates(react_id, entry, 10, 5);
1048
1049 let preact_found = candidates.iter().any(|id| {
1050 index
1051 .get_entry(id)
1052 .map(|e| e.normalized_name.contains("preact"))
1053 .unwrap_or(false)
1054 });
1055
1056 assert!(preact_found, "Should find preact-dom via trigram matching");
1057 }
1058}