graphannis_core/annostorage/
inmemory.rs

1use super::{AnnotationStorage, EdgeAnnotationStorage, Match, NodeAnnotationStorage};
2use crate::annostorage::ValueSearch;
3use crate::errors::Result;
4use crate::graph::NODE_NAME_KEY;
5use crate::types::{AnnoKey, Annotation, Edge, NodeID};
6use crate::util::{self};
7use crate::{annostorage::symboltable::SymbolTable, errors::GraphAnnisCoreError};
8use core::ops::Bound::*;
9use itertools::Itertools;
10use rand::seq::IteratorRandom;
11use rustc_hash::FxHashSet;
12use std::borrow::Cow;
13use std::collections::{BTreeMap, BTreeSet, HashMap};
14use std::hash::Hash;
15use std::path::Path;
16use std::sync::Arc;
17
18#[derive(Serialize, Deserialize, Clone, Debug, Default, Copy)]
19struct SparseAnnotation {
20    key: usize,
21    val: usize,
22}
23
24type ValueItemMap<T> = HashMap<usize, BTreeSet<T>>;
25
26#[derive(Serialize, Deserialize, Clone, Default)]
27pub struct AnnoStorageImpl<T: Ord + Hash + Default> {
28    by_container: HashMap<T, Vec<SparseAnnotation>>,
29    /// A map from an annotation key symbol to a map of all its values to the items having this value for the annotation key
30    by_anno: HashMap<usize, ValueItemMap<T>>,
31    /// Maps a distinct annotation key to the number of elements having this annotation key.
32    anno_key_sizes: BTreeMap<AnnoKey, usize>,
33    anno_keys: SymbolTable<AnnoKey>,
34    anno_values: SymbolTable<String>,
35
36    /// Sampled histograms for each annotation key .
37    /// Each histogram bound defines a range of values where we estimate that they have the same number of occurences.
38    histogram_bounds: BTreeMap<usize, Vec<String>>,
39    largest_item: Option<T>,
40    total_number_of_annos: usize,
41}
42
43impl<T: Ord + Hash + Clone + serde::Serialize + serde::de::DeserializeOwned + Default>
44    AnnoStorageImpl<T>
45{
46    pub fn new() -> AnnoStorageImpl<T> {
47        AnnoStorageImpl {
48            by_container: HashMap::default(),
49            by_anno: HashMap::default(),
50            anno_keys: SymbolTable::new(),
51            anno_values: SymbolTable::new(),
52            anno_key_sizes: BTreeMap::new(),
53            histogram_bounds: BTreeMap::new(),
54            largest_item: None,
55            total_number_of_annos: 0,
56        }
57    }
58
59    fn clear_internal(&mut self) {
60        self.by_container.clear();
61        self.by_anno.clear();
62        self.anno_keys.clear();
63        self.anno_key_sizes.clear();
64        self.histogram_bounds.clear();
65        self.largest_item = None;
66        self.anno_values.clear();
67    }
68
69    fn create_sparse_anno(&mut self, orig: Annotation) -> Result<SparseAnnotation> {
70        let key = self.anno_keys.insert(orig.key)?;
71        let val = self.anno_values.insert(orig.val)?;
72
73        Ok(SparseAnnotation { key, val })
74    }
75
76    fn create_annotation_from_sparse(&self, orig: &SparseAnnotation) -> Option<Annotation> {
77        let key = self.anno_keys.get_value_ref(orig.key)?;
78        let val = self.anno_values.get_value_ref(orig.val)?;
79
80        Some(Annotation {
81            key: key.clone(),
82            val: val.clone(),
83        })
84    }
85
86    fn remove_element_from_by_anno(&mut self, anno: &SparseAnnotation, item: &T) {
87        let remove_anno_key = if let Some(annos_for_key) = self.by_anno.get_mut(&anno.key) {
88            let remove_anno_val = if let Some(items_for_anno) = annos_for_key.get_mut(&anno.val) {
89                items_for_anno.remove(item);
90                items_for_anno.is_empty()
91            } else {
92                false
93            };
94            // remove the hash set of items for the original annotation if it empty
95            if remove_anno_val {
96                annos_for_key.remove(&anno.val);
97                annos_for_key.is_empty()
98            } else {
99                false
100            }
101        } else {
102            false
103        };
104        if remove_anno_key {
105            self.by_anno.remove(&anno.key);
106            // TODO: remove from symbol table?
107        }
108    }
109
110    fn check_and_remove_value_symbol(&mut self, value_id: usize) {
111        let mut still_used = false;
112        for values in self.by_anno.values() {
113            if values.contains_key(&value_id) {
114                still_used = true;
115                break;
116            }
117        }
118        if !still_used {
119            self.anno_values.remove(value_id);
120        }
121    }
122}
123
124impl<T> AnnoStorageImpl<T>
125where
126    T: Ord + Hash + Default + Clone + serde::Serialize + serde::de::DeserializeOwned + Send + Sync,
127    (T, Arc<AnnoKey>): Into<Match>,
128{
129    fn matching_items<'a>(
130        &'a self,
131        namespace: Option<&str>,
132        name: &str,
133        value: Option<&str>,
134    ) -> Box<dyn Iterator<Item = Result<(T, Arc<AnnoKey>)>> + 'a> {
135        let key_ranges: Vec<Arc<AnnoKey>> = if let Some(ns) = namespace {
136            vec![Arc::from(AnnoKey {
137                ns: ns.into(),
138                name: name.into(),
139            })]
140        } else {
141            let qnames = match self.get_qnames(name) {
142                Ok(qnames) => qnames,
143                Err(e) => return Box::new(std::iter::once(Err(e))),
144            };
145            qnames.into_iter().map(Arc::from).collect()
146        };
147        // Create a vector fore each matching AnnoKey to the value map containing all items and their annotation values
148        // for this key.
149        let value_maps: Vec<(Arc<AnnoKey>, &ValueItemMap<T>)> = key_ranges
150            .into_iter()
151            .filter_map(|key| {
152                let key_id = self.anno_keys.get_symbol(&key)?;
153                self.by_anno
154                    .get(&key_id)
155                    .map(|values_for_key| (key, values_for_key))
156            })
157            .collect();
158
159        if let Some(value) = value {
160            let target_value_symbol = self.anno_values.get_symbol(&value.into());
161
162            if let Some(target_value_symbol) = target_value_symbol {
163                let it = value_maps
164                    .into_iter()
165                    // find the items with the correct value
166                    .filter_map(move |(key, values)| {
167                        values.get(&target_value_symbol).map(|items| (items, key))
168                    })
169                    // flatten the hash set of all items, returns all items for the condition
170                    .flat_map(|(items, key)| items.iter().cloned().zip(std::iter::repeat(key)))
171                    .map(Ok);
172                Box::new(it)
173            } else {
174                // value is not known, return empty result
175                Box::new(std::iter::empty())
176            }
177        } else {
178            let it = value_maps
179                .into_iter()
180                // flatten the hash set of all items of the value map
181                .flat_map(|(key, values)| {
182                    values
183                        .iter()
184                        .flat_map(|(_, items)| items.iter().cloned())
185                        .zip(std::iter::repeat(key))
186                })
187                .map(Ok);
188            Box::new(it)
189        }
190    }
191}
192
193impl<T> AnnotationStorage<T> for AnnoStorageImpl<T>
194where
195    T: Ord + Hash + Default + Clone + Send + Sync + serde::Serialize + serde::de::DeserializeOwned,
196    (T, Arc<AnnoKey>): Into<Match>,
197{
198    fn insert(&mut self, item: T, anno: Annotation) -> Result<()> {
199        let orig_anno_key = anno.key.clone();
200        let anno = self.create_sparse_anno(anno)?;
201
202        let existing_anno = {
203            let existing_item_entry = self.by_container.entry(item.clone()).or_default();
204
205            // check if there is already an item with the same annotation key
206            let existing_entry_idx = existing_item_entry.binary_search_by_key(&anno.key, |a| a.key);
207
208            if let Ok(existing_entry_idx) = existing_entry_idx {
209                let orig_anno = existing_item_entry[existing_entry_idx];
210                // abort if the same annotation key with the same value already exist
211                if orig_anno.val == anno.val {
212                    return Ok(());
213                }
214                // insert annotation for item at existing position
215                existing_item_entry[existing_entry_idx] = anno;
216                Some(orig_anno)
217            } else if let Err(insertion_idx) = existing_entry_idx {
218                // insert at sorted position -> the result will still be a sorted vector
219                existing_item_entry.insert(insertion_idx, anno);
220                None
221            } else {
222                None
223            }
224        };
225
226        if let Some(existing_anno) = &existing_anno {
227            // remove the relation from the original annotation to this item
228            self.remove_element_from_by_anno(existing_anno, &item);
229        }
230
231        // inserts a new relation between the annotation and the item
232        // if set is not existing yet it is created
233        let item_list_for_value = self
234            .by_anno
235            .entry(anno.key)
236            .or_default()
237            .entry(anno.val)
238            .or_default();
239        item_list_for_value.insert(item.clone());
240
241        if existing_anno.is_none() {
242            // a new annotation entry was inserted and did not replace an existing one
243            self.total_number_of_annos += 1;
244
245            if let Some(largest_item) = self.largest_item.clone() {
246                if largest_item < item {
247                    self.largest_item = Some(item);
248                }
249            } else {
250                self.largest_item = Some(item);
251            }
252
253            let anno_key_entry = self.anno_key_sizes.entry(orig_anno_key).or_insert(0);
254            *anno_key_entry += 1;
255        }
256
257        Ok(())
258    }
259
260    fn remove_annotation_for_item(
261        &mut self,
262        item: &T,
263        key: &AnnoKey,
264    ) -> Result<Option<Cow<'_, str>>> {
265        let mut result = None;
266
267        let orig_key = key;
268        if let Some(key) = self.anno_keys.get_symbol(key)
269            && let Some(mut all_annos) = self.by_container.remove(item)
270        {
271            // find the specific annotation key from the sorted vector of all annotations of this item
272            let anno_idx = all_annos.binary_search_by_key(&key, |a| a.key);
273
274            if let Ok(anno_idx) = anno_idx {
275                // since value was found, also remove the item from the other containers
276                self.remove_element_from_by_anno(&all_annos[anno_idx], item);
277
278                let old_value = all_annos[anno_idx].val;
279
280                // remove the specific annotation key from the entry
281                all_annos.remove(anno_idx);
282
283                // decrease the annotation count for this key
284                let new_key_count: usize =
285                    if let Some(num_of_keys) = self.anno_key_sizes.get_mut(orig_key) {
286                        *num_of_keys -= 1;
287                        *num_of_keys
288                    } else {
289                        0
290                    };
291                // if annotation count dropped to zero remove the key
292                if new_key_count == 0 {
293                    self.by_anno.remove(&key);
294                    self.anno_key_sizes.remove(orig_key);
295                    self.anno_keys.remove(key);
296                }
297
298                result = self
299                    .anno_values
300                    .get_value_ref(old_value)
301                    .map(|v| Cow::Owned(v.clone()));
302
303                self.check_and_remove_value_symbol(old_value);
304                self.total_number_of_annos -= 1;
305            }
306            // if there are more annotations for this item, re-insert them
307            if !all_annos.is_empty() {
308                self.by_container.insert(item.clone(), all_annos);
309            }
310        }
311
312        Ok(result)
313    }
314
315    fn remove_item(&mut self, item: &T) -> Result<bool> {
316        let mut result = false;
317
318        if let Some(all_annos) = self.by_container.remove(item) {
319            for anno in all_annos {
320                // since value was found, also remove the item from the other containers
321                self.remove_element_from_by_anno(&anno, item);
322
323                if let Some(resolved_key) = self.anno_keys.get_value(anno.key) {
324                    // decrease the annotation count for this key
325                    let new_key_count: usize =
326                        if let Some(num_of_keys) = self.anno_key_sizes.get_mut(&resolved_key) {
327                            *num_of_keys -= 1;
328                            *num_of_keys
329                        } else {
330                            0
331                        };
332                    // if annotation count dropped to zero remove the key
333                    if new_key_count == 0 {
334                        self.by_anno.remove(&anno.key);
335                        self.anno_key_sizes.remove(&resolved_key);
336                        self.anno_keys.remove(anno.key);
337                    }
338                }
339
340                result = true;
341                self.check_and_remove_value_symbol(anno.val);
342                self.total_number_of_annos -= 1;
343            }
344        }
345
346        Ok(result)
347    }
348
349    fn clear(&mut self) -> Result<()> {
350        self.clear_internal();
351        Ok(())
352    }
353
354    fn get_qnames(&self, name: &str) -> Result<Vec<AnnoKey>> {
355        let it = self.anno_key_sizes.range(
356            AnnoKey {
357                name: name.into(),
358                ns: String::default(),
359            }..,
360        );
361        let mut result: Vec<AnnoKey> = Vec::default();
362        for (k, _) in it {
363            if k.name == name {
364                result.push(k.clone());
365            } else {
366                break;
367            }
368        }
369        Ok(result)
370    }
371
372    fn get_annotations_for_item(&self, item: &T) -> Result<Vec<Annotation>> {
373        if let Some(all_annos) = self.by_container.get(item) {
374            let mut result: Vec<Annotation> = Vec::with_capacity(all_annos.len());
375            for a in all_annos.iter() {
376                if let Some(a) = self.create_annotation_from_sparse(a) {
377                    result.push(a);
378                }
379            }
380            return Ok(result);
381        }
382        // return empty result if not found
383        Ok(Vec::new())
384    }
385
386    fn number_of_annotations(&self) -> Result<usize> {
387        Ok(self.total_number_of_annos)
388    }
389
390    fn is_empty(&self) -> Result<bool> {
391        Ok(self.total_number_of_annos == 0)
392    }
393
394    fn get_value_for_item(&self, item: &T, key: &AnnoKey) -> Result<Option<Cow<'_, str>>> {
395        if let (Some(key_symbol), Some(all_annos)) =
396            (self.anno_keys.get_symbol(key), self.by_container.get(item))
397        {
398            let idx = all_annos.binary_search_by_key(&key_symbol, |a| a.key);
399            if let Ok(idx) = idx
400                && let Some(val) = self.anno_values.get_value_ref(all_annos[idx].val)
401            {
402                return Ok(Some(Cow::Borrowed(val)));
403            }
404        }
405        Ok(None)
406    }
407
408    fn has_value_for_item(&self, item: &T, key: &AnnoKey) -> Result<bool> {
409        if let Some(key_symbol) = self.anno_keys.get_symbol(key)
410            && let Some(all_annos) = self.by_container.get(item)
411            && all_annos
412                .binary_search_by_key(&key_symbol, |a| a.key)
413                .is_ok()
414        {
415            return Ok(true);
416        }
417        Ok(false)
418    }
419
420    fn get_keys_for_iterator<'b>(
421        &'b self,
422        ns: Option<&str>,
423        name: Option<&str>,
424        it: Box<
425            dyn Iterator<Item = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>>
426                + 'b,
427        >,
428    ) -> Result<Vec<Match>> {
429        if let Some(name) = name {
430            if let Some(ns) = ns {
431                // return the only possible annotation for each node
432                let mut matches = Vec::new();
433                let key = Arc::from(AnnoKey {
434                    ns: ns.into(),
435                    name: name.into(),
436                });
437
438                if let Some(key_symbol) = self.anno_keys.get_symbol(&key) {
439                    for item in it {
440                        let item = item?;
441                        if let Some(all_annos) = self.by_container.get(&item)
442                            && all_annos
443                                .binary_search_by_key(&key_symbol, |a| a.key)
444                                .is_ok()
445                        {
446                            matches.push((item, key.clone()).into());
447                        }
448                    }
449                }
450                Ok(matches)
451            } else {
452                let matching_key_symbols: Vec<(usize, Arc<AnnoKey>)> = self
453                    .get_qnames(name)?
454                    .into_iter()
455                    .filter_map(|key| {
456                        self.anno_keys
457                            .get_symbol(&key)
458                            .map(|key_symbol| (key_symbol, Arc::from(key)))
459                    })
460                    .collect();
461                // return all annotations with the correct name for each node
462                let mut matches = Vec::new();
463                for item in it {
464                    let item = item?;
465                    for (key_symbol, key) in matching_key_symbols.iter() {
466                        if let Some(all_annos) = self.by_container.get(&item)
467                            && all_annos
468                                .binary_search_by_key(&key_symbol, |a| &a.key)
469                                .is_ok()
470                        {
471                            matches.push((item.clone(), key.clone()).into());
472                        }
473                    }
474                }
475                Ok(matches)
476            }
477        } else {
478            // return all annotations for each node
479            let mut matches = Vec::new();
480            for item in it {
481                let item = item?;
482                let all_keys = self.get_all_keys_for_item(&item, None, None)?;
483                for anno_key in all_keys {
484                    matches.push((item.clone(), anno_key).into());
485                }
486            }
487            Ok(matches)
488        }
489    }
490
491    fn number_of_annotations_by_name(&self, ns: Option<&str>, name: &str) -> Result<usize> {
492        let qualified_keys = match ns {
493            Some(ns) => self.anno_key_sizes.range((
494                Included(AnnoKey {
495                    name: name.into(),
496                    ns: ns.into(),
497                }),
498                Included(AnnoKey {
499                    name: name.into(),
500                    ns: ns.into(),
501                }),
502            )),
503            None => self.anno_key_sizes.range(
504                AnnoKey {
505                    name: name.into(),
506                    ns: String::default(),
507                }..AnnoKey {
508                    name: name.into(),
509                    ns: std::char::MAX.to_string(),
510                },
511            ),
512        };
513        let mut result = 0;
514        for (_anno_key, anno_size) in qualified_keys {
515            result += anno_size;
516        }
517        Ok(result)
518    }
519
520    fn exact_anno_search<'a>(
521        &'a self,
522        namespace: Option<&str>,
523        name: &str,
524        value: ValueSearch<&str>,
525    ) -> Box<dyn Iterator<Item = Result<Match>> + 'a> {
526        let key_ranges: Vec<Arc<AnnoKey>> = if let Some(ns) = namespace {
527            vec![Arc::from(AnnoKey {
528                ns: ns.into(),
529                name: name.into(),
530            })]
531        } else {
532            let qnames = match self.get_qnames(name) {
533                Ok(qnames) => qnames,
534                Err(e) => return Box::new(std::iter::once(Err(e))),
535            };
536            qnames.into_iter().map(Arc::from).collect()
537        };
538        // Create a vector for each matching AnnoKey to the value map containing all items and their annotation values
539        // for this key.
540        let value_maps: Vec<(Arc<AnnoKey>, &ValueItemMap<T>)> = key_ranges
541            .into_iter()
542            .filter_map(|key| {
543                let key_id = self.anno_keys.get_symbol(&key)?;
544                self.by_anno
545                    .get(&key_id)
546                    .map(|values_for_key| (key, values_for_key))
547            })
548            .collect();
549
550        if let ValueSearch::Some(value) = value {
551            let target_value_symbol = self.anno_values.get_symbol(&value.into());
552
553            if let Some(target_value_symbol) = target_value_symbol {
554                let it = value_maps
555                    .into_iter()
556                    // find the items with the correct value
557                    .filter_map(move |(key, values)| {
558                        values.get(&target_value_symbol).map(|items| (items, key))
559                    })
560                    // flatten the hash set of all items, returns all items for the condition
561                    .flat_map(|(items, key)| items.iter().cloned().zip(std::iter::repeat(key)))
562                    .map(move |item| Ok(item.into()));
563                Box::new(it)
564            } else {
565                // value is not known, return empty result
566                Box::new(std::iter::empty())
567            }
568        } else {
569            // Search for all annotations having a matching qualified name, regardless of the value
570            let matching_qname_annos = value_maps
571                .into_iter()
572                // flatten the hash set of all items of the value map
573                .flat_map(|(key, values)| {
574                    values
575                        .iter()
576                        .flat_map(|(_, items)| items.iter().cloned())
577                        .zip(std::iter::repeat(key))
578                });
579
580            if let ValueSearch::NotSome(value) = value {
581                let value = value.to_string();
582                let it = matching_qname_annos
583                    .map(move |(item, anno_key)| {
584                        let value = self.get_value_for_item(&item, &anno_key)?;
585                        Ok((item, anno_key, value))
586                    })
587                    .filter_map_ok(move |(item, anno_key, item_value)| {
588                        if let Some(item_value) = item_value
589                            && item_value != value
590                        {
591                            return Some((item, anno_key).into());
592                        }
593                        None
594                    });
595                Box::new(it)
596            } else {
597                Box::new(matching_qname_annos.map(move |item| Ok(item.into())))
598            }
599        }
600    }
601
602    fn regex_anno_search<'a>(
603        &'a self,
604        namespace: Option<&str>,
605        name: &str,
606        pattern: &str,
607        negated: bool,
608    ) -> Box<dyn Iterator<Item = Result<Match>> + 'a> {
609        let full_match_pattern = util::regex_full_match(pattern);
610        let compiled_result = regex::Regex::new(&full_match_pattern);
611        if let Ok(re) = compiled_result {
612            let it = self
613                .matching_items(namespace, name, None)
614                .map(move |item| {
615                    let (item, anno_key) = item?;
616                    let value = self.get_value_for_item(&item, &anno_key)?;
617                    Ok((item, anno_key, value))
618                })
619                .filter_ok(move |(_, _, value)| {
620                    if let Some(val) = value {
621                        if negated {
622                            !re.is_match(val)
623                        } else {
624                            re.is_match(val)
625                        }
626                    } else {
627                        false
628                    }
629                })
630                .map_ok(move |(item, anno_key, _)| (item, anno_key).into());
631            Box::new(it)
632        } else if negated {
633            // return all values
634            self.exact_anno_search(namespace, name, None.into())
635        } else {
636            // if regular expression pattern is invalid return empty iterator
637            Box::new(std::iter::empty())
638        }
639    }
640
641    fn get_all_keys_for_item(
642        &self,
643        item: &T,
644        ns: Option<&str>,
645        name: Option<&str>,
646    ) -> Result<Vec<Arc<AnnoKey>>> {
647        if let Some(name) = name {
648            if let Some(ns) = ns {
649                // fully qualified search
650                let key = AnnoKey {
651                    ns: ns.into(),
652                    name: name.into(),
653                };
654                if let Some(key_symbol) = self.anno_keys.get_symbol(&key)
655                    && let Some(all_annos) = self.by_container.get(item)
656                    && all_annos
657                        .binary_search_by_key(&key_symbol, |a| a.key)
658                        .is_ok()
659                {
660                    return Ok(vec![Arc::from(key)]);
661                }
662
663                Ok(vec![])
664            } else {
665                // get all qualified names for the given annotation name
666                let res: Result<Vec<Arc<AnnoKey>>> = self
667                    .get_qnames(name)?
668                    .into_iter()
669                    .map(|anno_key| {
670                        let value = self.get_value_for_item(item, &anno_key)?;
671                        Ok((anno_key, value))
672                    })
673                    .filter_ok(|(_key, value)| value.is_some())
674                    .map_ok(|(key, _)| Arc::from(key))
675                    .collect();
676                res
677            }
678        } else if let Some(all_annos) = self.by_container.get(item) {
679            // no annotation name given, return all
680            let mut result: Vec<Arc<AnnoKey>> = Vec::with_capacity(all_annos.len());
681            for a in all_annos.iter() {
682                if let Some(key) = self.anno_keys.get_value(a.key) {
683                    result.push(key);
684                }
685            }
686            Ok(result)
687        } else {
688            // return empty result if not found
689            Ok(vec![])
690        }
691    }
692
693    fn guess_max_count(
694        &self,
695        ns: Option<&str>,
696        name: &str,
697        lower_val: &str,
698        upper_val: &str,
699    ) -> Result<usize> {
700        // find all complete keys which have the given name (and namespace if given)
701        let qualified_keys = match ns {
702            Some(ns) => vec![AnnoKey {
703                name: name.into(),
704                ns: ns.into(),
705            }],
706            None => self.get_qnames(name)?,
707        };
708
709        let mut universe_size: usize = 0;
710        let mut sum_histogram_buckets: usize = 0;
711        let mut count_matches: usize = 0;
712
713        // guess for each fully qualified annotation key and return the sum of all guesses
714        for anno_key in qualified_keys {
715            if let Some(anno_size) = self.anno_key_sizes.get(&anno_key) {
716                universe_size += *anno_size;
717
718                if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key)
719                    && let Some(histo) = self.histogram_bounds.get(&anno_key)
720                {
721                    // find the range in which the value is contained
722                    // we need to make sure the histogram is not empty -> should have at least two bounds
723                    if histo.len() >= 2 {
724                        sum_histogram_buckets += histo.len() - 1;
725
726                        for i in 0..histo.len() - 1 {
727                            let bucket_begin = &histo[i];
728                            let bucket_end = &histo[i + 1];
729                            // check if the range overlaps with the search range
730                            if bucket_begin.as_str() <= upper_val
731                                && lower_val <= bucket_end.as_str()
732                            {
733                                count_matches += 1;
734                            }
735                        }
736                    }
737                }
738            }
739        }
740
741        if sum_histogram_buckets > 0 {
742            let selectivity: f64 = (count_matches as f64) / (sum_histogram_buckets as f64);
743            Ok((selectivity * (universe_size as f64)).round() as usize)
744        } else {
745            Ok(0)
746        }
747    }
748
749    fn guess_max_count_regex(&self, ns: Option<&str>, name: &str, pattern: &str) -> Result<usize> {
750        let full_match_pattern = util::regex_full_match(pattern);
751
752        // Get the total number of annotations with the namespace/name. We
753        // can't get larger than this number
754        let total = self.number_of_annotations_by_name(ns, name)?;
755
756        // Try to parse the regular expression
757        let parsed = regex_syntax::Parser::new().parse(&full_match_pattern);
758        if let Ok(parsed) = parsed {
759            let mut guessed_count = 0;
760
761            // Add the guessed count for each prefix
762            if let Some(prefix_set) = regex_syntax::hir::literal::Extractor::new()
763                .extract(&parsed)
764                .literals()
765            {
766                for val_prefix in prefix_set {
767                    let val_prefix = std::str::from_utf8(val_prefix.as_bytes());
768                    if let Ok(lower_val) = val_prefix {
769                        let mut upper_val = String::from(lower_val);
770                        upper_val.push(std::char::MAX);
771                        guessed_count += self.guess_max_count(ns, name, lower_val, &upper_val)?;
772                    }
773                }
774            } else {
775                // For regular expressions without a prefix the worst case would be `.*[X].*` where `[X]` are the most common characters.
776                // Sample values from the histogram to get a better estimation of how many percent of the actual values could match.
777                if let Ok(pattern) = regex::Regex::new(&full_match_pattern) {
778                    let mut rng = rand::rng();
779                    let qualified_keys: Vec<_> = match ns {
780                        Some(ns) => vec![AnnoKey {
781                            name: name.into(),
782                            ns: ns.into(),
783                        }],
784                        None => self.get_qnames(name)?,
785                    }
786                    .into_iter()
787                    .filter_map(|key| self.anno_keys.get_symbol(&key).map(|symbol| (symbol, key)))
788                    .collect();
789                    for (anno_key_symbol, anno_key) in qualified_keys {
790                        let anno_size = self
791                            .anno_key_sizes
792                            .get(&anno_key)
793                            .copied()
794                            .unwrap_or_default();
795
796                        if let Some(histo) = self.histogram_bounds.get(&anno_key_symbol)
797                            && !histo.is_empty()
798                        {
799                            let sampled_values = histo.iter().choose_multiple(&mut rng, 20);
800                            let matches = sampled_values
801                                .iter()
802                                .filter(|v| pattern.is_match(v))
803                                .count();
804                            if sampled_values.len() == matches {
805                                // Assume all values match
806                                guessed_count += anno_size;
807                            } else if matches == 0 {
808                                // No match found, but use the bucket size as pessimistic guess
809                                guessed_count +=
810                                    (anno_size as f64 / sampled_values.len() as f64) as usize;
811                            } else {
812                                // Use the percent of matched values to guess the overall number
813                                let match_ratio = (matches as f64) / (sampled_values.len() as f64);
814                                guessed_count += ((anno_size as f64) * match_ratio) as usize;
815                            }
816                        }
817                    }
818                }
819            }
820
821            Ok(guessed_count.min(total))
822        } else {
823            Ok(0)
824        }
825    }
826
827    fn guess_most_frequent_value(
828        &self,
829        ns: Option<&str>,
830        name: &str,
831    ) -> Result<Option<Cow<'_, str>>> {
832        // find all complete keys which have the given name (and namespace if given)
833        let qualified_keys = match ns {
834            Some(ns) => vec![AnnoKey {
835                name: name.into(),
836                ns: ns.into(),
837            }],
838            None => self.get_qnames(name)?,
839        };
840
841        let mut sampled_values: HashMap<&str, usize> = HashMap::default();
842
843        // guess for each fully qualified annotation key
844        for anno_key in qualified_keys {
845            if let Some(anno_key) = self.anno_keys.get_symbol(&anno_key)
846                && let Some(histo) = self.histogram_bounds.get(&anno_key)
847            {
848                for v in histo.iter() {
849                    let count: &mut usize = sampled_values.entry(v).or_insert(0);
850                    *count += 1;
851                }
852            }
853        }
854        // find the value which is most frequent
855        if !sampled_values.is_empty() {
856            let mut max_count = 0;
857            let mut max_value = Cow::Borrowed("");
858            for (v, count) in sampled_values.into_iter() {
859                if count >= max_count {
860                    max_value = Cow::Borrowed(v);
861                    max_count = count;
862                }
863            }
864            Ok(Some(max_value))
865        } else {
866            Ok(None)
867        }
868    }
869
870    fn get_all_values(
871        &self,
872        key: &AnnoKey,
873        most_frequent_first: bool,
874    ) -> Result<Vec<Cow<'_, str>>> {
875        if let Some(key) = self.anno_keys.get_symbol(key)
876            && let Some(values_for_key) = self.by_anno.get(&key)
877        {
878            if most_frequent_first {
879                let result = values_for_key
880                    .iter()
881                    .filter_map(|(val, items)| {
882                        let val = self.anno_values.get_value_ref(*val)?;
883                        Some((items.len(), val))
884                    })
885                    .sorted()
886                    .rev()
887                    .map(|(_, val)| Cow::Borrowed(&val[..]))
888                    .collect();
889                return Ok(result);
890            } else {
891                let result = values_for_key
892                    .iter()
893                    .filter_map(|(val, _items)| self.anno_values.get_value_ref(*val))
894                    .map(|val| Cow::Borrowed(&val[..]))
895                    .collect();
896                return Ok(result);
897            }
898        }
899        Ok(vec![])
900    }
901
902    fn annotation_keys(&self) -> Result<Vec<AnnoKey>> {
903        Ok(self.anno_key_sizes.keys().cloned().collect())
904    }
905
906    fn get_largest_item(&self) -> Result<Option<T>> {
907        Ok(self.largest_item.clone())
908    }
909
910    fn calculate_statistics(&mut self) -> Result<()> {
911        let max_histogram_buckets = 250;
912        let max_sampled_annotations = 2500;
913
914        self.histogram_bounds.clear();
915
916        // collect statistics for each annotation key separately
917        for anno_key in self.anno_key_sizes.keys() {
918            if let Some(anno_key) = self.anno_keys.get_symbol(anno_key) {
919                // sample a maximal number of annotation values
920                let mut rng = rand::rng();
921                if let Some(values_for_key) = self.by_anno.get(&anno_key) {
922                    let sampled_anno_values: Vec<usize> = values_for_key
923                        .iter()
924                        .flat_map(|(val, items)| {
925                            // repeat value corresponding to the number of nodes with this annotation
926                            let v = vec![*val; items.len()];
927                            v.into_iter()
928                        })
929                        .collect();
930                    let sampled_anno_indexes: FxHashSet<usize> = rand::seq::index::sample(
931                        &mut rng,
932                        sampled_anno_values.len(),
933                        std::cmp::min(sampled_anno_values.len(), max_sampled_annotations),
934                    )
935                    .into_iter()
936                    .collect();
937
938                    let mut sampled_anno_values: Vec<String> = sampled_anno_values
939                        .into_iter()
940                        .enumerate()
941                        .filter(|x| sampled_anno_indexes.contains(&x.0))
942                        .filter_map(|x| self.anno_values.get_value_ref(x.1).cloned())
943                        .collect();
944                    // create uniformly distributed histogram bounds
945                    sampled_anno_values.sort();
946
947                    let num_hist_bounds = if sampled_anno_values.len() < (max_histogram_buckets + 1)
948                    {
949                        sampled_anno_values.len()
950                    } else {
951                        max_histogram_buckets + 1
952                    };
953
954                    let hist = self.histogram_bounds.entry(anno_key).or_default();
955
956                    if num_hist_bounds >= 2 {
957                        hist.resize(num_hist_bounds, String::from(""));
958
959                        let delta: usize = (sampled_anno_values.len() - 1) / (num_hist_bounds - 1);
960                        let delta_fraction: usize =
961                            (sampled_anno_values.len() - 1) % (num_hist_bounds - 1);
962
963                        let mut pos = 0;
964                        let mut pos_fraction = 0;
965                        for hist_item in hist.iter_mut() {
966                            *hist_item = sampled_anno_values[pos].clone();
967                            pos += delta;
968                            pos_fraction += delta_fraction;
969
970                            if pos_fraction >= (num_hist_bounds - 1) {
971                                pos += 1;
972                                pos_fraction -= num_hist_bounds - 1;
973                            }
974                        }
975                    }
976                }
977            }
978        }
979        Ok(())
980    }
981
982    fn load_annotations_from(&mut self, location: &Path) -> Result<()> {
983        // always remove all entries first, so even if there is an error the anno storage is empty
984        self.clear_internal();
985
986        let path = location.join("nodes_v1.bin");
987        let f = std::fs::File::open(path.clone()).map_err(|e| {
988            GraphAnnisCoreError::LoadingAnnotationStorage {
989                path: path.to_string_lossy().to_string(),
990                source: e,
991            }
992        })?;
993        let mut reader = std::io::BufReader::new(f);
994        *self = bincode::deserialize_from(&mut reader)?;
995
996        self.anno_keys.after_deserialization();
997        self.anno_values.after_deserialization();
998
999        Ok(())
1000    }
1001
1002    fn save_annotations_to(&self, location: &Path) -> Result<()> {
1003        let f = std::fs::File::create(location.join("nodes_v1.bin"))?;
1004        let mut writer = std::io::BufWriter::new(f);
1005        bincode::serialize_into(&mut writer, self)?;
1006
1007        Ok(())
1008    }
1009}
1010
1011impl NodeAnnotationStorage for AnnoStorageImpl<NodeID> {
1012    fn get_node_id_from_name(&self, node_name: &str) -> Result<Option<NodeID>> {
1013        if let (Some(anno_name_symbol), Some(value_symbol)) = (
1014            self.anno_keys.get_symbol(&NODE_NAME_KEY),
1015            self.anno_values.get_symbol(&String::from(node_name)),
1016        ) && let Some(items_with_anno) = self.by_anno.get(&anno_name_symbol)
1017            && let Some(items) = items_with_anno.get(&value_symbol)
1018        {
1019            return Ok(items.iter().copied().next());
1020        }
1021
1022        Ok(None)
1023    }
1024
1025    fn has_node_name(&self, node_name: &str) -> Result<bool> {
1026        if let (Some(anno_name_symbol), Some(value_symbol)) = (
1027            self.anno_keys.get_symbol(&NODE_NAME_KEY),
1028            self.anno_values.get_symbol(&String::from(node_name)),
1029        ) && let Some(items_with_anno) = self.by_anno.get(&anno_name_symbol)
1030            && let Some(items) = items_with_anno.get(&value_symbol)
1031        {
1032            return Ok(!items.is_empty());
1033        }
1034
1035        Ok(false)
1036    }
1037}
1038
1039impl EdgeAnnotationStorage for AnnoStorageImpl<Edge> {}
1040
1041impl AnnoStorageImpl<Edge> {
1042    pub fn after_deserialization(&mut self) {
1043        self.anno_keys.after_deserialization();
1044        self.anno_values.after_deserialization();
1045    }
1046}
1047
1048#[cfg(test)]
1049mod tests;