1use super::{AnnotationStorage, EdgeAnnotationStorage, Match, NodeAnnotationStorage};
2use crate::annostorage::ValueSearch;
3use crate::errors::Result;
4use crate::graph::NODE_NAME_KEY;
5use crate::types::{AnnoKey, Annotation, Edge, NodeID};
6use crate::util::{self};
7use crate::{annostorage::symboltable::SymbolTable, errors::GraphAnnisCoreError};
8use core::ops::Bound::*;
9use itertools::Itertools;
10use rand::seq::IteratorRandom;
11use rustc_hash::FxHashSet;
12use std::borrow::Cow;
13use std::collections::{BTreeMap, BTreeSet, HashMap};
14use std::hash::Hash;
15use std::path::Path;
16use std::sync::Arc;
17
18#[derive(Serialize, Deserialize, Clone, Debug, Default, Copy)]
19struct SparseAnnotation {
20 key: usize,
21 val: usize,
22}
23
24type ValueItemMap<T> = HashMap<usize, BTreeSet<T>>;
25
26#[derive(Serialize, Deserialize, Clone, Default)]
27pub struct AnnoStorageImpl<T: Ord + Hash + Default> {
28 by_container: HashMap<T, Vec<SparseAnnotation>>,
29 by_anno: HashMap<usize, ValueItemMap<T>>,
31 anno_key_sizes: BTreeMap<AnnoKey, usize>,
33 anno_keys: SymbolTable<AnnoKey>,
34 anno_values: SymbolTable<String>,
35
36 histogram_bounds: BTreeMap<usize, Vec<String>>,
39 largest_item: Option<T>,
40 total_number_of_annos: usize,
41}
42
43impl<T: Ord + Hash + Clone + serde::Serialize + serde::de::DeserializeOwned + Default>
44 AnnoStorageImpl<T>
45{
46 pub fn new() -> AnnoStorageImpl<T> {
47 AnnoStorageImpl {
48 by_container: HashMap::default(),
49 by_anno: HashMap::default(),
50 anno_keys: SymbolTable::new(),
51 anno_values: SymbolTable::new(),
52 anno_key_sizes: BTreeMap::new(),
53 histogram_bounds: BTreeMap::new(),
54 largest_item: None,
55 total_number_of_annos: 0,
56 }
57 }
58
59 fn clear_internal(&mut self) {
60 self.by_container.clear();
61 self.by_anno.clear();
62 self.anno_keys.clear();
63 self.anno_key_sizes.clear();
64 self.histogram_bounds.clear();
65 self.largest_item = None;
66 self.anno_values.clear();
67 }
68
69 fn create_sparse_anno(&mut self, orig: Annotation) -> Result<SparseAnnotation> {
70 let key = self.anno_keys.insert(orig.key)?;
71 let val = self.anno_values.insert(orig.val)?;
72
73 Ok(SparseAnnotation { key, val })
74 }
75
76 fn create_annotation_from_sparse(&self, orig: &SparseAnnotation) -> Option<Annotation> {
77 let key = self.anno_keys.get_value_ref(orig.key)?;
78 let val = self.anno_values.get_value_ref(orig.val)?;
79
80 Some(Annotation {
81 key: key.clone(),
82 val: val.clone(),
83 })
84 }
85
86 fn remove_element_from_by_anno(&mut self, anno: &SparseAnnotation, item: &T) {
87 let remove_anno_key = if let Some(annos_for_key) = self.by_anno.get_mut(&anno.key) {
88 let remove_anno_val = if let Some(items_for_anno) = annos_for_key.get_mut(&anno.val) {
89 items_for_anno.remove(item);
90 items_for_anno.is_empty()
91 } else {
92 false
93 };
94 if remove_anno_val {
96 annos_for_key.remove(&anno.val);
97 annos_for_key.is_empty()
98 } else {
99 false
100 }
101 } else {
102 false
103 };
104 if remove_anno_key {
105 self.by_anno.remove(&anno.key);
106 }
108 }
109
110 fn check_and_remove_value_symbol(&mut self, value_id: usize) {
111 let mut still_used = false;
112 for values in self.by_anno.values() {
113 if values.contains_key(&value_id) {
114 still_used = true;
115 break;
116 }
117 }
118 if !still_used {
119 self.anno_values.remove(value_id);
120 }
121 }
122}
123
124impl<T> AnnoStorageImpl<T>
125where
126 T: Ord + Hash + Default + Clone + serde::Serialize + serde::de::DeserializeOwned + Send + Sync,
127 (T, Arc<AnnoKey>): Into<Match>,
128{
129 fn matching_items<'a>(
130 &'a self,
131 namespace: Option<&str>,
132 name: &str,
133 value: Option<&str>,
134 ) -> Box<dyn Iterator<Item = Result<(T, Arc<AnnoKey>)>> + 'a> {
135 let key_ranges: Vec<Arc<AnnoKey>> = if let Some(ns) = namespace {
136 vec![Arc::from(AnnoKey {
137 ns: ns.into(),
138 name: name.into(),
139 })]
140 } else {
141 let qnames = match self.get_qnames(name) {
142 Ok(qnames) => qnames,
143 Err(e) => return Box::new(std::iter::once(Err(e))),
144 };
145 qnames.into_iter().map(Arc::from).collect()
146 };
147 let value_maps: Vec<(Arc<AnnoKey>, &ValueItemMap<T>)> = key_ranges
150 .into_iter()
151 .filter_map(|key| {
152 let key_id = self.anno_keys.get_symbol(&key)?;
153 self.by_anno
154 .get(&key_id)
155 .map(|values_for_key| (key, values_for_key))
156 })
157 .collect();
158
159 if let Some(value) = value {
160 let target_value_symbol = self.anno_values.get_symbol(&value.into());
161
162 if let Some(target_value_symbol) = target_value_symbol {
163 let it = value_maps
164 .into_iter()
165 .filter_map(move |(key, values)| {
167 values.get(&target_value_symbol).map(|items| (items, key))
168 })
169 .flat_map(|(items, key)| items.iter().cloned().zip(std::iter::repeat(key)))
171 .map(Ok);
172 Box::new(it)
173 } else {
174 Box::new(std::iter::empty())
176 }
177 } else {
178 let it = value_maps
179 .into_iter()
180 .flat_map(|(key, values)| {
182 values
183 .iter()
184 .flat_map(|(_, items)| items.iter().cloned())
185 .zip(std::iter::repeat(key))
186 })
187 .map(Ok);
188 Box::new(it)
189 }
190 }
191}
192
193impl<T> AnnotationStorage<T> for AnnoStorageImpl<T>
194where
195 T: Ord + Hash + Default + Clone + Send + Sync + serde::Serialize + serde::de::DeserializeOwned,
196 (T, Arc<AnnoKey>): Into<Match>,
197{
198 fn insert(&mut self, item: T, anno: Annotation) -> Result<()> {
199 let orig_anno_key = anno.key.clone();
200 let anno = self.create_sparse_anno(anno)?;
201
202 let existing_anno = {
203 let existing_item_entry = self.by_container.entry(item.clone()).or_default();
204
205 let existing_entry_idx = existing_item_entry.binary_search_by_key(&anno.key, |a| a.key);
207
208 if let Ok(existing_entry_idx) = existing_entry_idx {
209 let orig_anno = existing_item_entry[existing_entry_idx];
210 if orig_anno.val == anno.val {
212 return Ok(());
213 }
214 existing_item_entry[existing_entry_idx] = anno;
216 Some(orig_anno)
217 } else if let Err(insertion_idx) = existing_entry_idx {
218 existing_item_entry.insert(insertion_idx, anno);
220 None
221 } else {
222 None
223 }
224 };
225
226 if let Some(existing_anno) = &existing_anno {
227 self.remove_element_from_by_anno(existing_anno, &item);
229 }
230
231 let item_list_for_value = self
234 .by_anno
235 .entry(anno.key)
236 .or_default()
237 .entry(anno.val)
238 .or_default();
239 item_list_for_value.insert(item.clone());
240
241 if existing_anno.is_none() {
242 self.total_number_of_annos += 1;
244
245 if let Some(largest_item) = self.largest_item.clone() {
246 if largest_item < item {
247 self.largest_item = Some(item);
248 }
249 } else {
250 self.largest_item = Some(item);
251 }
252
253 let anno_key_entry = self.anno_key_sizes.entry(orig_anno_key).or_insert(0);
254 *anno_key_entry += 1;
255 }
256
257 Ok(())
258 }
259
260 fn remove_annotation_for_item(
261 &mut self,
262 item: &T,
263 key: &AnnoKey,
264 ) -> Result<Option<Cow<'_, str>>> {
265 let mut result = None;
266
267 let orig_key = key;
268 if let Some(key) = self.anno_keys.get_symbol(key)
269 && let Some(mut all_annos) = self.by_container.remove(item)
270 {
271 let anno_idx = all_annos.binary_search_by_key(&key, |a| a.key);
273
274 if let Ok(anno_idx) = anno_idx {
275 self.remove_element_from_by_anno(&all_annos[anno_idx], item);
277
278 let old_value = all_annos[anno_idx].val;
279
280 all_annos.remove(anno_idx);
282
283 let new_key_count: usize =
285 if let Some(num_of_keys) = self.anno_key_sizes.get_mut(orig_key) {
286 *num_of_keys -= 1;
287 *num_of_keys
288 } else {
289 0
290 };
291 if new_key_count == 0 {
293 self.by_anno.remove(&key);
294 self.anno_key_sizes.remove(orig_key);
295 self.anno_keys.remove(key);
296 }
297
298 result = self
299 .anno_values
300 .get_value_ref(old_value)
301 .map(|v| Cow::Owned(v.clone()));
302
303 self.check_and_remove_value_symbol(old_value);
304 self.total_number_of_annos -= 1;
305 }
306 if !all_annos.is_empty() {
308 self.by_container.insert(item.clone(), all_annos);
309 }
310 }
311
312 Ok(result)
313 }
314
315 fn remove_item(&mut self, item: &T) -> Result<bool> {
316 let mut result = false;
317
318 if let Some(all_annos) = self.by_container.remove(item) {
319 for anno in all_annos {
320 self.remove_element_from_by_anno(&anno, item);
322
323 if let Some(resolved_key) = self.anno_keys.get_value(anno.key) {
324 let new_key_count: usize =
326 if let Some(num_of_keys) = self.anno_key_sizes.get_mut(&resolved_key) {
327 *num_of_keys -= 1;
328 *num_of_keys
329 } else {
330 0
331 };
332 if new_key_count == 0 {
334 self.by_anno.remove(&anno.key);
335 self.anno_key_sizes.remove(&resolved_key);
336 self.anno_keys.remove(anno.key);
337 }
338 }
339
340 result = true;
341 self.check_and_remove_value_symbol(anno.val);
342 self.total_number_of_annos -= 1;
343 }
344 }
345
346 Ok(result)
347 }
348
349 fn clear(&mut self) -> Result<()> {
350 self.clear_internal();
351 Ok(())
352 }
353
354 fn get_qnames(&self, name: &str) -> Result<Vec<AnnoKey>> {
355 let it = self.anno_key_sizes.range(
356 AnnoKey {
357 name: name.into(),
358 ns: String::default(),
359 }..,
360 );
361 let mut result: Vec<AnnoKey> = Vec::default();
362 for (k, _) in it {
363 if k.name == name {
364 result.push(k.clone());
365 } else {
366 break;
367 }
368 }
369 Ok(result)
370 }
371
372 fn get_annotations_for_item(&self, item: &T) -> Result<Vec<Annotation>> {
373 if let Some(all_annos) = self.by_container.get(item) {
374 let mut result: Vec<Annotation> = Vec::with_capacity(all_annos.len());
375 for a in all_annos.iter() {
376 if let Some(a) = self.create_annotation_from_sparse(a) {
377 result.push(a);
378 }
379 }
380 return Ok(result);
381 }
382 Ok(Vec::new())
384 }
385
386 fn number_of_annotations(&self) -> Result<usize> {
387 Ok(self.total_number_of_annos)
388 }
389
390 fn is_empty(&self) -> Result<bool> {
391 Ok(self.total_number_of_annos == 0)
392 }
393
394 fn get_value_for_item(&self, item: &T, key: &AnnoKey) -> Result<Option<Cow<'_, str>>> {
395 if let (Some(key_symbol), Some(all_annos)) =
396 (self.anno_keys.get_symbol(key), self.by_container.get(item))
397 {
398 let idx = all_annos.binary_search_by_key(&key_symbol, |a| a.key);
399 if let Ok(idx) = idx
400 && let Some(val) = self.anno_values.get_value_ref(all_annos[idx].val)
401 {
402 return Ok(Some(Cow::Borrowed(val)));
403 }
404 }
405 Ok(None)
406 }
407
408 fn has_value_for_item(&self, item: &T, key: &AnnoKey) -> Result<bool> {
409 if let Some(key_symbol) = self.anno_keys.get_symbol(key)
410 && let Some(all_annos) = self.by_container.get(item)
411 && all_annos
412 .binary_search_by_key(&key_symbol, |a| a.key)
413 .is_ok()
414 {
415 return Ok(true);
416 }
417 Ok(false)
418 }
419
420 fn get_keys_for_iterator<'b>(
421 &'b self,
422 ns: Option<&str>,
423 name: Option<&str>,
424 it: Box<
425 dyn Iterator<Item = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>>
426 + 'b,
427 >,
428 ) -> Result<Vec<Match>> {
429 if let Some(name) = name {
430 if let Some(ns) = ns {
431 let mut matches = Vec::new();
433 let key = Arc::from(AnnoKey {
434 ns: ns.into(),
435 name: name.into(),
436 });
437
438 if let Some(key_symbol) = self.anno_keys.get_symbol(&key) {
439 for item in it {
440 let item = item?;
441 if let Some(all_annos) = self.by_container.get(&item)
442 && all_annos
443 .binary_search_by_key(&key_symbol, |a| a.key)
444 .is_ok()
445 {
446 matches.push((item, key.clone()).into());
447 }
448 }
449 }
450 Ok(matches)
451 } else {
452 let matching_key_symbols: Vec<(usize, Arc<AnnoKey>)> = self
453 .get_qnames(name)?
454 .into_iter()
455 .filter_map(|key| {
456 self.anno_keys
457 .get_symbol(&key)
458 .map(|key_symbol| (key_symbol, Arc::from(key)))
459 })
460 .collect();
461 let mut matches = Vec::new();
463 for item in it {
464 let item = item?;
465 for (key_symbol, key) in matching_key_symbols.iter() {
466 if let Some(all_annos) = self.by_container.get(&item)
467 && all_annos
468 .binary_search_by_key(&key_symbol, |a| &a.key)
469 .is_ok()
470 {
471 matches.push((item.clone(), key.clone()).into());
472 }
473 }
474 }
475 Ok(matches)
476 }
477 } else {
478 let mut matches = Vec::new();
480 for item in it {
481 let item = item?;
482 let all_keys = self.get_all_keys_for_item(&item, None, None)?;
483 for anno_key in all_keys {
484 matches.push((item.clone(), anno_key).into());
485 }
486 }
487 Ok(matches)
488 }
489 }
490
491 fn number_of_annotations_by_name(&self, ns: Option<&str>, name: &str) -> Result<usize> {
492 let qualified_keys = match ns {
493 Some(ns) => self.anno_key_sizes.range((
494 Included(AnnoKey {
495 name: name.into(),
496 ns: ns.into(),
497 }),
498 Included(AnnoKey {
499 name: name.into(),
500 ns: ns.into(),
501 }),
502 )),
503 None => self.anno_key_sizes.range(
504 AnnoKey {
505 name: name.into(),
506 ns: String::default(),
507 }..AnnoKey {
508 name: name.into(),
509 ns: std::char::MAX.to_string(),
510 },
511 ),
512 };
513 let mut result = 0;
514 for (_anno_key, anno_size) in qualified_keys {
515 result += anno_size;
516 }
517 Ok(result)
518 }
519
520 fn exact_anno_search<'a>(
521 &'a self,
522 namespace: Option<&str>,
523 name: &str,
524 value: ValueSearch<&str>,
525 ) -> Box<dyn Iterator<Item = Result<Match>> + 'a> {
526 let key_ranges: Vec<Arc<AnnoKey>> = if let Some(ns) = namespace {
527 vec![Arc::from(AnnoKey {
528 ns: ns.into(),
529 name: name.into(),
530 })]
531 } else {
532 let qnames = match self.get_qnames(name) {
533 Ok(qnames) => qnames,
534 Err(e) => return Box::new(std::iter::once(Err(e))),
535 };
536 qnames.into_iter().map(Arc::from).collect()
537 };
538 let value_maps: Vec<(Arc<AnnoKey>, &ValueItemMap<T>)> = key_ranges
541 .into_iter()
542 .filter_map(|key| {
543 let key_id = self.anno_keys.get_symbol(&key)?;
544 self.by_anno
545 .get(&key_id)
546 .map(|values_for_key| (key, values_for_key))
547 })
548 .collect();
549
550 if let ValueSearch::Some(value) = value {
551 let target_value_symbol = self.anno_values.get_symbol(&value.into());
552
553 if let Some(target_value_symbol) = target_value_symbol {
554 let it = value_maps
555 .into_iter()
556 .filter_map(move |(key, values)| {
558 values.get(&target_value_symbol).map(|items| (items, key))
559 })
560 .flat_map(|(items, key)| items.iter().cloned().zip(std::iter::repeat(key)))
562 .map(move |item| Ok(item.into()));
563 Box::new(it)
564 } else {
565 Box::new(std::iter::empty())
567 }
568 } else {
569 let matching_qname_annos = value_maps
571 .into_iter()
572 .flat_map(|(key, values)| {
574 values
575 .iter()
576 .flat_map(|(_, items)| items.iter().cloned())
577 .zip(std::iter::repeat(key))
578 });
579
580 if let ValueSearch::NotSome(value) = value {
581 let value = value.to_string();
582 let it = matching_qname_annos
583 .map(move |(item, anno_key)| {
584 let value = self.get_value_for_item(&item, &anno_key)?;
585 Ok((item, anno_key, value))
586 })
587 .filter_map_ok(move |(item, anno_key, item_value)| {
588 if let Some(item_value) = item_value
589 && item_value != value
590 {
591 return Some((item, anno_key).into());
592 }
593 None
594 });
595 Box::new(it)
596 } else {
597 Box::new(matching_qname_annos.map(move |item| Ok(item.into())))
598 }
599 }
600 }
601
602 fn regex_anno_search<'a>(
603 &'a self,
604 namespace: Option<&str>,
605 name: &str,
606 pattern: &str,
607 negated: bool,
608 ) -> Box<dyn Iterator<Item = Result<Match>> + 'a> {
609 let full_match_pattern = util::regex_full_match(pattern);
610 let compiled_result = regex::Regex::new(&full_match_pattern);
611 if let Ok(re) = compiled_result {
612 let it = self
613 .matching_items(namespace, name, None)
614 .map(move |item| {
615 let (item, anno_key) = item?;
616 let value = self.get_value_for_item(&item, &anno_key)?;
617 Ok((item, anno_key, value))
618 })
619 .filter_ok(move |(_, _, value)| {
620 if let Some(val) = value {
621 if negated {
622 !re.is_match(val)
623 } else {
624 re.is_match(val)
625 }
626 } else {
627 false
628 }
629 })
630 .map_ok(move |(item, anno_key, _)| (item, anno_key).into());
631 Box::new(it)
632 } else if negated {
633 self.exact_anno_search(namespace, name, None.into())
635 } else {
636 Box::new(std::iter::empty())
638 }
639 }
640
641 fn get_all_keys_for_item(
642 &self,
643 item: &T,
644 ns: Option<&str>,
645 name: Option<&str>,
646 ) -> Result<Vec<Arc<AnnoKey>>> {
647 if let Some(name) = name {
648 if let Some(ns) = ns {
649 let key = AnnoKey {
651 ns: ns.into(),
652 name: name.into(),
653 };
654 if let Some(key_symbol) = self.anno_keys.get_symbol(&key)
655 && let Some(all_annos) = self.by_container.get(item)
656 && all_annos
657 .binary_search_by_key(&key_symbol, |a| a.key)
658 .is_ok()
659 {
660 return Ok(vec![Arc::from(key)]);
661 }
662
663 Ok(vec![])
664 } else {
665 let res: Result<Vec<Arc<AnnoKey>>> = self
667 .get_qnames(name)?
668 .into_iter()
669 .map(|anno_key| {
670 let value = self.get_value_for_item(item, &anno_key)?;
671 Ok((anno_key, value))
672 })
673 .filter_ok(|(_key, value)| value.is_some())
674 .map_ok(|(key, _)| Arc::from(key))
675 .collect();
676 res
677 }
678 } else if let Some(all_annos) = self.by_container.get(item) {
679 let mut result: Vec<Arc<AnnoKey>> = Vec::with_capacity(all_annos.len());
681 for a in all_annos.iter() {
682 if let Some(key) = self.anno_keys.get_value(a.key) {
683 result.push(key);
684 }
685 }
686 Ok(result)
687 } else {
688 Ok(vec![])
690 }
691 }
692
693 fn guess_max_count(
694 &self,
695 ns: Option<&str>,
696 name: &str,
697 lower_val: &str,
698 upper_val: &str,
699 ) -> Result<usize> {
700 let qualified_keys = match ns {
702 Some(ns) => vec![AnnoKey {
703 name: name.into(),
704 ns: ns.into(),
705 }],
706 None => self.get_qnames(name)?,
707 };
708
709 let mut universe_size: usize = 0;
710 let mut sum_histogram_buckets: usize = 0;
711 let mut count_matches: usize = 0;
712
713 for anno_key in qualified_keys {
715 if let Some(anno_size) = self.anno_key_sizes.get(&anno_key) {
716 universe_size += *anno_size;
717
718 if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key)
719 && let Some(histo) = self.histogram_bounds.get(&anno_key)
720 {
721 if histo.len() >= 2 {
724 sum_histogram_buckets += histo.len() - 1;
725
726 for i in 0..histo.len() - 1 {
727 let bucket_begin = &histo[i];
728 let bucket_end = &histo[i + 1];
729 if bucket_begin.as_str() <= upper_val
731 && lower_val <= bucket_end.as_str()
732 {
733 count_matches += 1;
734 }
735 }
736 }
737 }
738 }
739 }
740
741 if sum_histogram_buckets > 0 {
742 let selectivity: f64 = (count_matches as f64) / (sum_histogram_buckets as f64);
743 Ok((selectivity * (universe_size as f64)).round() as usize)
744 } else {
745 Ok(0)
746 }
747 }
748
749 fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result<usize> {
750 let full_match_pattern = util::regex_full_match(pattern);
751
752 let total = self.number_of_annotations_by_name(ns, name)?;
755
756 let parsed = regex_syntax::Parser::new().parse(&full_match_pattern);
758 if let Ok(parsed) = parsed {
759 let mut guessed_count = 0;
760
761 if let Some(prefix_set) = regex_syntax::hir::literal::Extractor::new()
763 .extract(&parsed)
764 .literals()
765 {
766 for val_prefix in prefix_set {
767 let val_prefix = std::str::from_utf8(val_prefix.as_bytes());
768 if let Ok(lower_val) = val_prefix {
769 let mut upper_val = String::from(lower_val);
770 upper_val.push(std::char::MAX);
771 guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?;
772 }
773 }
774 } else {
775 if let Ok(pattern) = regex::Regex::new(&full_match_pattern) {
778 let mut rng = rand::rng();
779 let qualified_keys: Vec<_> = match ns {
780 Some(ns) => vec![AnnoKey {
781 name: name.into(),
782 ns: ns.into(),
783 }],
784 None => self.get_qnames(name)?,
785 }
786 .into_iter()
787 .filter_map(|key| self.anno_keys.get_symbol(&key).map(|symbol| (symbol, key)))
788 .collect();
789 for (anno_key_symbol, anno_key) in qualified_keys {
790 let anno_size = self
791 .anno_key_sizes
792 .get(&anno_key)
793 .copied()
794 .unwrap_or_default();
795
796 if let Some(histo) = self.histogram_bounds.get(&anno_key_symbol)
797 && !histo.is_empty()
798 {
799 let sampled_values = histo.iter().choose_multiple(&mut rng, 20);
800 let matches = sampled_values
801 .iter()
802 .filter(|v| pattern.is_match(v))
803 .count();
804 if sampled_values.len() == matches {
805 guessed_count += anno_size;
807 } else if matches == 0 {
808 guessed_count +=
810 (anno_size as f64 / sampled_values.len() as f64) as usize;
811 } else {
812 let match_ratio = (matches as f64) / (sampled_values.len() as f64);
814 guessed_count += ((anno_size as f64) * match_ratio) as usize;
815 }
816 }
817 }
818 }
819 }
820
821 Ok(guessed_count.min(total))
822 } else {
823 Ok(0)
824 }
825 }
826
827 fn guess_most_frequent_value(
828 &self,
829 ns: Option<&str>,
830 name: &str,
831 ) -> Result<Option<Cow<'_, str>>> {
832 let qualified_keys = match ns {
834 Some(ns) => vec![AnnoKey {
835 name: name.into(),
836 ns: ns.into(),
837 }],
838 None => self.get_qnames(name)?,
839 };
840
841 let mut sampled_values: HashMap<&str, usize> = HashMap::default();
842
843 for anno_key in qualified_keys {
845 if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key)
846 && let Some(histo) = self.histogram_bounds.get(&anno_key)
847 {
848 for v in histo.iter() {
849 let count: &mut usize = sampled_values.entry(v).or_insert(0);
850 *count += 1;
851 }
852 }
853 }
854 if !sampled_values.is_empty() {
856 let mut max_count = 0;
857 let mut max_value = Cow::Borrowed("");
858 for (v, count) in sampled_values.into_iter() {
859 if count >= max_count {
860 max_value = Cow::Borrowed(v);
861 max_count = count;
862 }
863 }
864 Ok(Some(max_value))
865 } else {
866 Ok(None)
867 }
868 }
869
870 fn get_all_values(
871 &self,
872 key: &AnnoKey,
873 most_frequent_first: bool,
874 ) -> Result<Vec<Cow<'_, str>>> {
875 if let Some(key) = self.anno_keys.get_symbol(key)
876 && let Some(values_for_key) = self.by_anno.get(&key)
877 {
878 if most_frequent_first {
879 let result = values_for_key
880 .iter()
881 .filter_map(|(val, items)| {
882 let val = self.anno_values.get_value_ref(*val)?;
883 Some((items.len(), val))
884 })
885 .sorted()
886 .rev()
887 .map(|(_, val)| Cow::Borrowed(&val[..]))
888 .collect();
889 return Ok(result);
890 } else {
891 let result = values_for_key
892 .iter()
893 .filter_map(|(val, _items)| self.anno_values.get_value_ref(*val))
894 .map(|val| Cow::Borrowed(&val[..]))
895 .collect();
896 return Ok(result);
897 }
898 }
899 Ok(vec![])
900 }
901
902 fn annotation_keys(&self) -> Result<Vec<AnnoKey>> {
903 Ok(self.anno_key_sizes.keys().cloned().collect())
904 }
905
906 fn get_largest_item(&self) -> Result<Option<T>> {
907 Ok(self.largest_item.clone())
908 }
909
910 fn calculate_statistics(&mut self) -> Result<()> {
911 let max_histogram_buckets = 250;
912 let max_sampled_annotations = 2500;
913
914 self.histogram_bounds.clear();
915
916 for anno_key in self.anno_key_sizes.keys() {
918 if let Some(anno_key) = self.anno_keys.get_symbol(anno_key) {
919 let mut rng = rand::rng();
921 if let Some(values_for_key) = self.by_anno.get(&anno_key) {
922 let sampled_anno_values: Vec<usize> = values_for_key
923 .iter()
924 .flat_map(|(val, items)| {
925 let v = vec![*val; items.len()];
927 v.into_iter()
928 })
929 .collect();
930 let sampled_anno_indexes: FxHashSet<usize> = rand::seq::index::sample(
931 &mut rng,
932 sampled_anno_values.len(),
933 std::cmp::min(sampled_anno_values.len(), max_sampled_annotations),
934 )
935 .into_iter()
936 .collect();
937
938 let mut sampled_anno_values: Vec<String> = sampled_anno_values
939 .into_iter()
940 .enumerate()
941 .filter(|x| sampled_anno_indexes.contains(&x.0))
942 .filter_map(|x| self.anno_values.get_value_ref(x.1).cloned())
943 .collect();
944 sampled_anno_values.sort();
946
947 let num_hist_bounds = if sampled_anno_values.len() < (max_histogram_buckets + 1)
948 {
949 sampled_anno_values.len()
950 } else {
951 max_histogram_buckets + 1
952 };
953
954 let hist = self.histogram_bounds.entry(anno_key).or_default();
955
956 if num_hist_bounds >= 2 {
957 hist.resize(num_hist_bounds, String::from(""));
958
959 let delta: usize = (sampled_anno_values.len() - 1) / (num_hist_bounds - 1);
960 let delta_fraction: usize =
961 (sampled_anno_values.len() - 1) % (num_hist_bounds - 1);
962
963 let mut pos = 0;
964 let mut pos_fraction = 0;
965 for hist_item in hist.iter_mut() {
966 *hist_item = sampled_anno_values[pos].clone();
967 pos += delta;
968 pos_fraction += delta_fraction;
969
970 if pos_fraction >= (num_hist_bounds - 1) {
971 pos += 1;
972 pos_fraction -= num_hist_bounds - 1;
973 }
974 }
975 }
976 }
977 }
978 }
979 Ok(())
980 }
981
982 fn load_annotations_from(&mut self, location: &Path) -> Result<()> {
983 self.clear_internal();
985
986 let path = location.join("nodes_v1.bin");
987 let f = std::fs::File::open(path.clone()).map_err(|e| {
988 GraphAnnisCoreError::LoadingAnnotationStorage {
989 path: path.to_string_lossy().to_string(),
990 source: e,
991 }
992 })?;
993 let mut reader = std::io::BufReader::new(f);
994 *self = bincode::deserialize_from(&mut reader)?;
995
996 self.anno_keys.after_deserialization();
997 self.anno_values.after_deserialization();
998
999 Ok(())
1000 }
1001
1002 fn save_annotations_to(&self, location: &Path) -> Result<()> {
1003 let f = std::fs::File::create(location.join("nodes_v1.bin"))?;
1004 let mut writer = std::io::BufWriter::new(f);
1005 bincode::serialize_into(&mut writer, self)?;
1006
1007 Ok(())
1008 }
1009}
1010
1011impl NodeAnnotationStorage for AnnoStorageImpl<NodeID> {
1012 fn get_node_id_from_name(&self, node_name: &str) -> Result<Option<NodeID>> {
1013 if let (Some(anno_name_symbol), Some(value_symbol)) = (
1014 self.anno_keys.get_symbol(&NODE_NAME_KEY),
1015 self.anno_values.get_symbol(&String::from(node_name)),
1016 ) && let Some(items_with_anno) = self.by_anno.get(&anno_name_symbol)
1017 && let Some(items) = items_with_anno.get(&value_symbol)
1018 {
1019 return Ok(items.iter().copied().next());
1020 }
1021
1022 Ok(None)
1023 }
1024
1025 fn has_node_name(&self, node_name: &str) -> Result<bool> {
1026 if let (Some(anno_name_symbol), Some(value_symbol)) = (
1027 self.anno_keys.get_symbol(&NODE_NAME_KEY),
1028 self.anno_values.get_symbol(&String::from(node_name)),
1029 ) && let Some(items_with_anno) = self.by_anno.get(&anno_name_symbol)
1030 && let Some(items) = items_with_anno.get(&value_symbol)
1031 {
1032 return Ok(!items.is_empty());
1033 }
1034
1035 Ok(false)
1036 }
1037}
1038
1039impl EdgeAnnotationStorage for AnnoStorageImpl<Edge> {}
1040
1041impl AnnoStorageImpl<Edge> {
1042 pub fn after_deserialization(&mut self) {
1043 self.anno_keys.after_deserialization();
1044 self.anno_values.after_deserialization();
1045 }
1046}
1047
1048#[cfg(test)]
1049mod tests;